diff options
author | Philip Sargent <philip.sargent@klebos.com> | 2021-02-06 00:18:48 +0000 |
---|---|---|
committer | Philip Sargent <philip.sargent@klebos.com> | 2021-02-06 00:18:48 +0000 |
commit | 5836c6ff9080285e48662b28da7286a877b0e813 (patch) | |
tree | 4b5c8859bd530c7828814cdebd9127d924bfd21d /parsers/logbooks.py | |
parent | a4d7183260d80d8c3d93fb87bbc54c787e171d64 (diff) | |
download | troggle-5836c6ff9080285e48662b28da7286a877b0e813.tar.gz troggle-5836c6ff9080285e48662b28da7286a877b0e813.tar.bz2 troggle-5836c6ff9080285e48662b28da7286a877b0e813.zip |
Importing old logbooks
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- | parsers/logbooks.py | 129 |
1 files changed, 77 insertions, 52 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py index a8f0cca..9e8fc8f 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -37,9 +37,10 @@ def GetTripPersons(trippeople, expedition, logtime_underground): tripperson = re.sub(round_bracket_regex, "", tripperson).strip() personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower()) if not personyear: - print((" - No name match for: '%s'" % tripperson)) - message = "No name match for: '%s' in year '%s'" % (tripperson, expedition.year) + message = "No name match for: ||'%s'|| in year '%s'" % (tripperson, expedition.year) + print(message) DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[expedition.year + "~" + tripperson]=message res.append((personyear, logtime_underground)) if mul: author = personyear @@ -91,6 +92,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_ print(" ! - Skipping logentry: " + title + " - no author for entry") message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year) DataIssue.objects.create(parser='logbooks', message=message) + logdataissues["title"]=message return # This needs attention. The slug field is derived from 'title' @@ -133,7 +135,7 @@ def ParseDate(tripdate, year): else: message = " ! - Bad date in logbook: " + tripdate + " - " + year DataIssue.objects.create(parser='logbooks', message=message) - logdataissues["author"]=message + logdataissues["tripdate"]=message assert False, tripdate return datetime.date(year, month, day) @@ -254,57 +256,77 @@ def Parseloghtmltxt(year, expedition, txt): "html", tripid1, logbook_entry_count) # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it +# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. def Parseloghtml01(year, expedition, txt): global logentries global logdataissues + errorcount = 0 tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt) logbook_entry_count = 0 for trippara in tripparas: logbook_entry_count += 1 + try: + tripentry = year + "." + str(logbook_entry_count) + s = re.match("(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara) + if not s: + message = " ! - Skipping logentry on failure to parse header: " + tripentry + trippara[:300] + "..." + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tripentry]=message + print(message) + break + tripheader, triptext = s.group(1), s.group(2) + mtripid = re.search(r'<a id="(.*?)"', tripheader) + tripid = mtripid and mtripid.group(1) or "" + tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader) + + tripdate, triptitle, trippeople = tripheader.split("|") + ldate = ParseDate(tripdate.strip(), year) - s = re.match("(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara) - assert s, trippara[:300] - tripheader, triptext = s.group(1), s.group(2) - mtripid = re.search(r'<a id="(.*?)"', tripheader) - tripid = mtripid and mtripid.group(1) or "" - tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader) - - tripdate, triptitle, trippeople = tripheader.split("|") - ldate = ParseDate(tripdate.strip(), year) - - mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext) - if mtu: - tu = mtu.group(1) - triptext = triptext[:mtu.start(0)] + triptext[mtu.end():] - else: - tu = "" - - triptitles = triptitle.split(" - ") - tripcave = triptitles[0].strip() - - ltriptext = triptext - - mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext) - if mtail: - ltriptext = ltriptext[:mtail.start(0)] - ltriptext = re.sub(r"</p>", "", ltriptext) - ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) - ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip() - ltriptext = re.sub(r"</?u>", "_", ltriptext) - ltriptext = re.sub(r"</?i>", "''", ltriptext) - ltriptext = re.sub(r"</?b>", "'''", ltriptext) - - entrytuple = (ldate, tripcave, triptitle, ltriptext, - trippeople, expedition, tu, "html01", tripid) - logentries.append(entrytuple) - - EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, - trippeople=trippeople, expedition=expedition, logtime_underground=0, - entry_type="html") - - EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, - "html01", tripid, logbook_entry_count) + mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext) + if mtu: + tu = mtu.group(1) + triptext = triptext[:mtu.start(0)] + triptext[mtu.end():] + else: + tu = "" + + triptitles = triptitle.split(" - ") + tripcave = triptitles[0].strip() + + ltriptext = triptext + + mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext) + if mtail: + ltriptext = ltriptext[:mtail.start(0)] + ltriptext = re.sub(r"</p>", "", ltriptext) + ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip() + ltriptext = re.sub(r"</?u>", "_", ltriptext) + ltriptext = re.sub(r"</?i>", "''", ltriptext) + ltriptext = re.sub(r"</?b>", "'''", ltriptext) + + entrytuple = (ldate, tripcave, triptitle, ltriptext, + trippeople, expedition, tu, "html01", tripid) + logentries.append(entrytuple) + + EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, + trippeople=trippeople, expedition=expedition, logtime_underground=0, + entry_type="html") + + EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, + "html01", tripid, logbook_entry_count) + except: + message = " ! - Skipping logentry due to exception in: " + tripentry + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tripentry]=message + print(message) + errorcount += 1 + if errorcount >5 : + message = " !!- TOO MANY ERRORS - aborting logbook: " + year + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tripentry]=message + print(message) + return # parser for 2003 def Parseloghtml03(year, expedition, txt): @@ -473,6 +495,8 @@ def LoadLogbookForExpedition(expedition,expect): def LoadLogbooks(): """ This is the master function for parsing all logbooks into the Troggle database. + Parser settings appropriate for each year are set in settings.py LOGBOOK_PARSER_SETTINGS. + This should be rewritten to use coroutines to load all logbooks from disc in parallel. """ global logdataissues @@ -481,13 +505,14 @@ def LoadLogbooks(): expos = Expedition.objects.all() if len(expos) <= 1: print(" ! No expeditions found. Load 'people' first.\n") - nologbook = ["1976", "1977","1978","1979","1980","1980","1981","1983","1984", - "1985","1986","1987","1988","1989","1990",] - entries = {"2020": 0, "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79, + nologbook = ["1976", "1977", "1978", "1979", "1980", "1981", "1986", "2020",] + entries = {"2021": 0, "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79, "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52, "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31, "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41, - "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1982": 0} + "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1, + "1985": 1,"1984": 1,"1983": 1,"1982": 42,} + # Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing. try: os.remove("loadlogbk.log") except OSError: @@ -503,8 +528,8 @@ def LoadLogbooks(): nlbe[expo.year]=numentries expd[expo.year]= 0 print("** total trips in ObjStore:", len(trips)) - for i in logdataissues: - print("{:15s}: {}".format(i, logdataissues[i])) + #for i in logdataissues: + # print("{:15s}: {}".format(i, logdataissues[i])) for lbe in trips: year, date, tripcave, triptitle, text, trippeople, tu, formattype = trips[lbe] @@ -513,7 +538,7 @@ def LoadLogbooks(): for y in expd: print("{} {}".format(y, expd[y]), nlbe[y]) yt += expd[y] - print("{} total".format(yt)) + print("total {} log entries in all expeditions".format(yt)) with shelve.open('logbktrips.shelve',writeback=True) as odb: for lbe in trips: |