diff options
author | Wookey <wookey@wookware.org> | 2019-04-02 00:57:54 +0100 |
---|---|---|
committer | Wookey <wookey@wookware.org> | 2019-04-02 00:57:54 +0100 |
commit | c4301cf6df56ba1bef4f2c908b949a2b45ea65dc (patch) | |
tree | 9c6bb4a4530824c8e072984a0346509298188030 /parsers/logbooks.py | |
parent | de7d68b1eb70542f66092cb0048af3d096e6980c (diff) | |
parent | bb8dbb381fe87c3a63e9586a1bf1e993b09c965b (diff) | |
download | troggle-c4301cf6df56ba1bef4f2c908b949a2b45ea65dc.tar.gz troggle-c4301cf6df56ba1bef4f2c908b949a2b45ea65dc.tar.bz2 troggle-c4301cf6df56ba1bef4f2c908b949a2b45ea65dc.zip |
Merge lots of troggle fixes
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- | parsers/logbooks.py | 47 |
1 files changed, 26 insertions, 21 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py index ffd8e21..cecbdb3 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -45,7 +45,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground): author = res[-1][0] return res, author -def GetTripCave(place): #need to be fuzzier about matching here. Already a very slow function... +def GetTripCave(place): #need to be fuzzier about matching here. Already a very slow function... # print "Getting cave for " , place try: katastNumRes=[] @@ -74,23 +74,23 @@ def GetTripCave(place): #need to be fuzzier about matching h noncaveplaces = [ "Journey", "Loser Plateau" ] -def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground): +def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"): """ saves a logbook entry and related persontrips """ trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground) if not author: - print(" - skipping logentry" + title + " no author for entry") + print(" - Skipping logentry: " + title + " no author for entry") return - -# tripCave = GetTripCave(place) - # + + #tripCave = GetTripCave(place) + lplace = place.lower() if lplace not in noncaveplaces: cave=GetCaveLookup().get(lplace) #Check for an existing copy of the current entry, and save expeditionday = expedition.get_expedition_day(date) - lookupAttribs={'date':date, 'title':title} - nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50]} + lookupAttribs={'date':date, 'title':title} + nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50], 'entry_type':entry_type} lbo, created=save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs) for tripperson, time_underground in trippersons: @@ -115,7 +115,7 @@ def ParseDate(tripdate, year): assert False, tripdate return datetime.date(year, month, day) -# 2007, 2008, 2006 +# 2006, 2008 - 2010 def Parselogwikitxt(year, expedition, txt): trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt) for triphead, triptext in trippara: @@ -140,9 +140,9 @@ def Parselogwikitxt(year, expedition, txt): #print "\n", tripcave, "--- ppp", trippeople, len(triptext) EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) -# 2002, 2004, 2005 +# 2002, 2004, 2005, 2007, 2011 - 2018 def Parseloghtmltxt(year, expedition, txt): - print(" - Using log html parser") + #print(" - Starting log html parser") tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt) logbook_entry_count = 0 for trippara in tripparas: @@ -163,7 +163,6 @@ def Parseloghtmltxt(year, expedition, txt): print("can't parse: ", trippara) # this is 2007 which needs editing #assert s, trippara continue - tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups() ldate = ParseDate(tripdate.strip(), year) #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate) @@ -174,16 +173,18 @@ def Parseloghtmltxt(year, expedition, txt): tripcave = triptitles[0] else: tripcave = "UNKNOWN" - #print "\n", tripcave, "--- ppp", trippeople, len(triptext) + #print("\n", tripcave, "--- ppp", trippeople, len(triptext)) ltriptext = re.sub(r"</p>", "", triptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip() - EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) + EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, + trippeople=trippeople, expedition=expedition, logtime_underground=0, + entry_type="html") if logbook_entry_count == 0: print(" - No trip entrys found in logbook, check the syntax matches htmltxt format") -# main parser for pre-2001. simpler because the data has been hacked so much to fit it +# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it def Parseloghtml01(year, expedition, txt): tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt) for trippara in tripparas: @@ -227,9 +228,11 @@ def Parseloghtml01(year, expedition, txt): #print ldate, trippeople.strip() # could includ the tripid (url link for cross referencing) - EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) - + EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, + trippeople=trippeople, expedition=expedition, logtime_underground=0, + entry_type="html") +# parser for 2003 def Parseloghtml03(year, expedition, txt): tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt) for trippara in tripparas: @@ -256,7 +259,9 @@ def Parseloghtml03(year, expedition, txt): ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip() ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext) - EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) + EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, + text = ltriptext, trippeople=trippeople, expedition=expedition, + logtime_underground=0, entry_type="html") def SetDatesFromLogbookEntries(expedition): @@ -281,8 +286,7 @@ def SetDatesFromLogbookEntries(expedition): def LoadLogbookForExpedition(expedition): """ Parses all logbook entries for one expedition """ - expowebbase = os.path.join(settings.EXPOWEB, "years") - #year = str(expedition.year) + expowebbase = os.path.join(settings.EXPOWEB, "years") yearlinks = settings.LOGBOOK_PARSER_SETTINGS logbook_parseable = False @@ -294,6 +298,7 @@ def LoadLogbookForExpedition(expedition): file_in.close() parsefunc = year_settings[1] logbook_parseable = True + print(" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1]) else: try: file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE)) @@ -304,7 +309,7 @@ def LoadLogbookForExpedition(expedition): parsefunc = settings.DEFAULT_LOGBOOK_PARSER except (IOError): logbook_parseable = False - print("Couldn't open default logbook file and nothing set for expo " + expedition.year) + print("Couldn't open default logbook file and nothing in settings for expo " + expedition.year) if logbook_parseable: parser = globals()[parsefunc] |