From a22b42e83267ee0120a6347d6f72af4fc7c0bd1b Mon Sep 17 00:00:00 2001
From: Sam Wenham )? # second date
+ s = re.match(r'''(?x)(?:\s* )? # second date
\s*(?:\s*)?
\s* )?
\s*
([\s\S]*?)(?=
([\s\S]*?)(?=
.*?\s*
", "\n\n", ltriptext).strip() + ltriptext = re.sub(r"
", "", triptext) + ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub(r"", "\n\n", ltriptext).strip() EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) + if logbook_entry_count == 0: + print(" - No trip entrys found in logbook, check the syntax matches htmltxt format") # main parser for pre-2001. simpler because the data has been hacked so much to fit it def Parseloghtml01(year, expedition, txt): - tripparas = re.findall("
]*>(T/?U.*)', triptext) + mtu = re.search(r'
]*>(T/?U.*)', triptext) if mtu: tu = mtu.group(1) triptext = triptext[:mtu.start(0)] + triptext[mtu.end():] @@ -206,17 +212,17 @@ def Parseloghtml01(year, expedition, txt): ltriptext = triptext - mtail = re.search('(?:[^<]*|\s|/|-|&|?p>|\((?:same day|\d+)\))*$', ltriptext) + mtail = re.search(r'(?:[^<]*|\s|/|-|&|?p>|\((?:same day|\d+)\))*$', ltriptext) if mtail: #print mtail.group(0) ltriptext = ltriptext[:mtail.start(0)] - ltriptext = re.sub("
", "", ltriptext) - ltriptext = re.sub("\s*?\n\s*", " ", ltriptext) - ltriptext = re.sub("|
", "\n\n", ltriptext).strip()
+ ltriptext = re.sub(r"
|
", "\n\n", ltriptext).strip()
#ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
- ltriptext = re.sub("?u>", "_", ltriptext)
- ltriptext = re.sub("?i>", "''", ltriptext)
- ltriptext = re.sub("?b>", "'''", ltriptext)
+ ltriptext = re.sub(r"?u>", "_", ltriptext)
+ ltriptext = re.sub(r"?i>", "''", ltriptext)
+ ltriptext = re.sub(r"?b>", "'''", ltriptext)
#print ldate, trippeople.strip()
@@ -225,19 +231,19 @@ def Parseloghtml01(year, expedition, txt):
def Parseloghtml03(year, expedition, txt):
- tripparas = re.findall("
", "\n\n", ltriptext).strip() - ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext) + ltriptext = re.sub(r"
", "", triptext) + ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub(r"", "\n\n", ltriptext).strip() + ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext) EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) -yearlinks = [ -# ("2013", "2013/logbook.html", Parseloghtmltxt), - ("2012", "2012/logbook.html", Parseloghtmltxt), - ("2011", "2011/logbook.html", Parseloghtmltxt), - ("2010", "2010/logbook.html", Parselogwikitxt), - ("2009", "2009/2009logbook.txt", Parselogwikitxt), - ("2008", "2008/2008logbook.txt", Parselogwikitxt), - ("2007", "2007/logbook.html", Parseloghtmltxt), - ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt), - ("2005", "2005/logbook.html", Parseloghtmltxt), - ("2004", "2004/logbook.html", Parseloghtmltxt), - ("2003", "2003/logbook.html", Parseloghtml03), - ("2002", "2002/logbook.html", Parseloghtmltxt), - ("2001", "2001/log.htm", Parseloghtml01), - ("2000", "2000/log.htm", Parseloghtml01), - ("1999", "1999/log.htm", Parseloghtml01), - ("1998", "1998/log.htm", Parseloghtml01), - ("1997", "1997/log.htm", Parseloghtml01), - ("1996", "1996/log.htm", Parseloghtml01), - ("1995", "1995/log.htm", Parseloghtml01), - ("1994", "1994/log.htm", Parseloghtml01), - ("1993", "1993/log.htm", Parseloghtml01), - ("1992", "1992/log.htm", Parseloghtml01), - ("1991", "1991/log.htm", Parseloghtml01), - ] def SetDatesFromLogbookEntries(expedition): """ @@ -295,23 +276,41 @@ def SetDatesFromLogbookEntries(expedition): persontrip.persontrip_next = None lprevpersontrip = persontrip persontrip.save() - - - + + def LoadLogbookForExpedition(expedition): """ Parses all logbook entries for one expedition """ expowebbase = os.path.join(settings.EXPOWEB, "years") year = str(expedition.year) - for lyear, lloc, parsefunc in yearlinks: - if lyear == year: - break - fin = open(os.path.join(expowebbase, lloc)) - print "opennning", lloc - txt = fin.read().decode("latin1") - fin.close() - parsefunc(year, expedition, txt) - SetDatesFromLogbookEntries(expedition) + yearlinks = settings.LOGBOOK_PARSER_SETTINGS + + logbook_parseable = False + + if expedition.year in yearlinks: + year_settings = yearlinks[expedition.year] + file_in = open(os.path.join(expowebbase, year_settings[0])) + txt = file_in.read().decode("latin1") + file_in.close() + parsefunc = year_settings[1] + logbook_parseable = True + else: + try: + file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE)) + txt = file_in.read().decode("latin1") + file_in.close() + logbook_parseable = True + print("No set parser found using default") + parsefunc = settings.DEFAULT_LOGBOOK_PARSER + except (IOError): + logbook_parseable = False + print("Couldn't open default logbook file and nothing set for expo " + expo.year) + + if logbook_parseable: + parser = globals()[parsefunc] + parser(expedition.year, expedition, txt) + SetDatesFromLogbookEntries(expedition) + return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count()) @@ -324,25 +323,49 @@ def LoadLogbooks(): #yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite #yearlinks = [ ("1996", "1996/log.htm", Parseloghtml01),] # overwrite - for year, lloc, parsefunc in yearlinks: - # This will not work until the corresponding year exists in the database. - # In 2012 this needed noscript/folk.csv to be updated first. - expedition = models.Expedition.objects.filter(year = year)[0] - fin = open(os.path.join(expowebbase, lloc)) - txt = fin.read().decode("latin1") - fin.close() - parsefunc(year, expedition, txt) - SetDatesFromLogbookEntries(expedition) + yearlinks = settings.LOGBOOK_PARSER_SETTINGS -dateRegex = re.compile('(\d\d\d\d)-(\d\d)-(\d\d)', re.S) -expeditionYearRegex = re.compile('(.*?)', re.S) -titleRegex = re.compile('