diff options
Diffstat (limited to 'parsers')
-rw-r--r-- | parsers/logbooks.py | 19 | ||||
-rw-r--r-- | parsers/people.py | 4 |
2 files changed, 11 insertions, 12 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py index ffd8e21..fbe00a3 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -115,7 +115,7 @@ def ParseDate(tripdate, year): assert False, tripdate return datetime.date(year, month, day) -# 2007, 2008, 2006 +# 2006, 2008 - 2010 def Parselogwikitxt(year, expedition, txt): trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt) for triphead, triptext in trippara: @@ -140,9 +140,9 @@ def Parselogwikitxt(year, expedition, txt): #print "\n", tripcave, "--- ppp", trippeople, len(triptext) EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) -# 2002, 2004, 2005 +# 2002, 2004, 2005, 2007, 2011 - 2018 def Parseloghtmltxt(year, expedition, txt): - print(" - Using log html parser") + #print(" - Starting log html parser") tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt) logbook_entry_count = 0 for trippara in tripparas: @@ -163,7 +163,6 @@ def Parseloghtmltxt(year, expedition, txt): print("can't parse: ", trippara) # this is 2007 which needs editing #assert s, trippara continue - tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups() ldate = ParseDate(tripdate.strip(), year) #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate) @@ -174,7 +173,7 @@ def Parseloghtmltxt(year, expedition, txt): tripcave = triptitles[0] else: tripcave = "UNKNOWN" - #print "\n", tripcave, "--- ppp", trippeople, len(triptext) + #print("\n", tripcave, "--- ppp", trippeople, len(triptext)) ltriptext = re.sub(r"</p>", "", triptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip() @@ -183,7 +182,7 @@ def Parseloghtmltxt(year, expedition, txt): print(" - No trip entrys found in logbook, check the syntax matches htmltxt format") -# main parser for pre-2001. simpler because the data has been hacked so much to fit it +# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it def Parseloghtml01(year, expedition, txt): tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt) for trippara in tripparas: @@ -229,7 +228,7 @@ def Parseloghtml01(year, expedition, txt): # could includ the tripid (url link for cross referencing) EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) - +# parser for 2003 def Parseloghtml03(year, expedition, txt): tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt) for trippara in tripparas: @@ -281,8 +280,7 @@ def SetDatesFromLogbookEntries(expedition): def LoadLogbookForExpedition(expedition): """ Parses all logbook entries for one expedition """ - expowebbase = os.path.join(settings.EXPOWEB, "years") - #year = str(expedition.year) + expowebbase = os.path.join(settings.EXPOWEB, "years") yearlinks = settings.LOGBOOK_PARSER_SETTINGS logbook_parseable = False @@ -294,6 +292,7 @@ def LoadLogbookForExpedition(expedition): file_in.close() parsefunc = year_settings[1] logbook_parseable = True + print(" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1]) else: try: file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE)) @@ -304,7 +303,7 @@ def LoadLogbookForExpedition(expedition): parsefunc = settings.DEFAULT_LOGBOOK_PARSER except (IOError): logbook_parseable = False - print("Couldn't open default logbook file and nothing set for expo " + expedition.year) + print("Couldn't open default logbook file and nothing in settings for expo " + expedition.year) if logbook_parseable: parser = globals()[parsefunc] diff --git a/parsers/people.py b/parsers/people.py index 3c3fc03..48d6c17 100644 --- a/parsers/people.py +++ b/parsers/people.py @@ -67,8 +67,8 @@ def LoadPersonsExpos(): for personline in personreader: name = personline[header["Name"]] - name = re.sub("<.*?>", "", name) - mname = re.match("(\w+)(?:\s((?:van |ten )?\w+))?(?:\s\(([^)]*)\))?", name) + name = re.sub(r"<.*?>", "", name) + mname = re.match(r"(\w+)(?:\s((?:van |ten )?\w+))?(?:\s\(([^)]*)\))?", name) nickname = mname.group(3) or "" lookupAttribs={'first_name':mname.group(1), 'last_name':(mname.group(2) or "")} |