diff options
-rw-r--r-- | core/utils.py | 4 | ||||
-rw-r--r-- | core/views/logbooks.py | 10 | ||||
-rw-r--r-- | parsers/logbooks.py | 273 |
3 files changed, 109 insertions, 178 deletions
diff --git a/core/utils.py b/core/utils.py index fe85533..bf7cb28 100644 --- a/core/utils.py +++ b/core/utils.py @@ -36,7 +36,11 @@ save_carefully() - core function that saves troggle objects in the database TROG = { 'pagecache' : { 'expedition' : {} + }, + 'issues' : { + 'logdataissues' : {} } + } # This is module-level executable. This is a Bad Thing. Especially when it touches the file system. diff --git a/core/views/logbooks.py b/core/views/logbooks.py index cb32bab..ecf0f6b 100644 --- a/core/views/logbooks.py +++ b/core/views/logbooks.py @@ -59,16 +59,16 @@ def expedition(request, expeditionname): if request.user.is_authenticated: if "reload" in request.GET: this_expedition = Expedition.objects.get(year=int(expeditionname)) - # Need to delete the exisitng entries or we get duplication + # Need to delete the existing entries or we get duplication # Need to delete both in the Django ORM and in our own object-store. entries = this_expedition.logbookentry_set.all() - print(f'! - expo {expeditionname} {len(entries)} entries') + print(f'! - expo {expeditionname} {len(entries)} entries initially') for entry in entries: - print(f'! - delete entry: "{entry}"') + #print(f'! - delete entry: "{entry}"') entry.delete() entries = this_expedition.logbookentry_set.all() - print(f'! - expo {expeditionname} {len(entries)} entries') - LoadLogbookForExpedition(this_expedition, 0) # 0 means re-parse + print(f'! - expo {expeditionname} {len(entries)} entries after deletion') + LoadLogbookForExpedition(this_expedition, 0) # 0 means re-parse as implies cache expected to be 0 logged_in = True else: logged_in = False diff --git a/parsers/logbooks.py b/parsers/logbooks.py index d615930..6a156af 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -1,8 +1,8 @@ import csv -import datetime +from datetime import datetime, date, time import os import re -import time +#import time import pickle import shelve @@ -26,8 +26,6 @@ todo=''' - refactor everything with some urgency, esp. LoadLogbookForExpedition() -- delete all the autoLogbooKEntry stuff when we are absolutely certain what it does - - Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or it is broken/incomplete and need hand-editing. @@ -46,26 +44,35 @@ todo=''' - We should ensure logbook.html is utf-8 and stop this crap: file_in = open(logbookfile,'rb') txt = file_in.read().decode("latin1") + +- this is a slow and uncertain function: cave = getCaveByReference(caveRef) ''' logentries = [] # the entire logbook for one year is a single object: a list of entries noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau', 'base camp', 'basecamp', 'top camp', 'topcamp' ] -logdataissues = {} +logdataissues = TROG['issues']['logdataissues'] trips ={} # # the logbook loading section # -def GetTripPersons(trippeople, expedition, logtime_underground, tid="!"): +def set_trip_id(year, seq): + tid= f"{year}.s{seq:02d}" + return tid + + +def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): res = [ ] author = None round_bracket_regex = re.compile(r"[\(\[].*?[\)\]]") - if tid =="!": - tid = expedition.year + "." + tripperson + #print(f'# {tid}') + for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople): tripperson = tripperson.strip() + if not tid: + tid = expedition.year + "." + tripperson + datetime.now().strftime("%S%f") # no good. Should be getting the tid mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson) if mul: tripperson = mul.group(1).strip() @@ -86,23 +93,24 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid="!"): author = res[-1][0] return res, author -def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid="!"): +def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None): """ saves a logbook entry and related persontrips Does NOT save the expeditionday_id - all NULLs. why? """ try: trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid) except: - message = " ! - Skipping logentry: %s - GetTripPersons FAIL in year '%s'" % (title, expedition.year) + message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL" DataIssue.objects.create(parser='logbooks', message=message) logdataissues["title"]=message + print(message) return if not author: - print(" ! - Skipping logentry: " + title + " - no author for entry") - message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year) + message = f" ! - {expedition.year} Skipping logentry: {title} - - no author for entry in year " DataIssue.objects.create(parser='logbooks', message=message) logdataissues["title"]=message + print(message) return # This needs attention. The slug field is derived from 'title' @@ -113,16 +121,16 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_ cave=None if lplace not in noncaveplaces: cave = GetCaveLookup().get(lplace) - # message = " ! - '" + lplace + "' place not in noncaveplaces." - # print(message) - # DataIssue.objects.create(parser='logbooks', message=message) #Check for an existing copy of the current entry, and save expeditionday = expedition.get_expedition_day(date) lookupAttribs={'date':date, 'title':title} # 'cave' is converted to a string doing this, which renders as the cave slug. # but it is a db query which we should try to avoid - rewrite this - nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug':slugify(title)[:50], 'entry_type':entry_type} + + #NEW sluf for a logbook entry here! Use the unique id, not the title !!! + slug = tid + slugify(title)[:50] + nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug, 'entry_type':entry_type} lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs) @@ -133,31 +141,37 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_ def ParseDate(tripdate, year): """ Interprets dates in the expo logbooks and returns a correct datetime.date object """ - mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate) - mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate) - if mdatestandard: - if not (mdatestandard.group(1) == year): - message = " ! - Bad date (year) in logbook: " + tripdate + " - " + year - DataIssue.objects.create(parser='logbooks', message=message) - logdataissues["tripdate"]=message - return datetime.date('1970', '01', '01') + try: + mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate) + mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate) + if mdatestandard: + if not (mdatestandard.group(1) == year): + message = " ! - Bad date (year) in logbook: " + tripdate + " - " + year + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues["tripdate"]=message + return datetime.date('1970', '01', '01') + else: + year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3)) + elif mdategoof: + if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]): + message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3) + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues["tripdate"]=message + return date('1970', '01', '01') + else: + yadd = int(year[:2]) * 100 + day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd else: - year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3)) - elif mdategoof: - if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]): - message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3) + message = " ! - Bad date in logbook: " + tripdate + " - " + year DataIssue.objects.create(parser='logbooks', message=message) logdataissues["tripdate"]=message - return datetime.date('1970', '01', '01') - else: - yadd = int(year[:2]) * 100 - day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd - else: - message = " ! - Bad date in logbook: " + tripdate + " - " + year + + return date(year, month, day) + except: + message = " ! - Failed to parse date in logbook: " + tripdate + " - " + year DataIssue.objects.create(parser='logbooks', message=message) logdataissues["tripdate"]=message - - return datetime.date(year, month, day) + return date(year, month, day) # (2006 - not any more), 2008 - 2009 def Parselogwikitxt(year, expedition, txt): @@ -168,6 +182,8 @@ def Parselogwikitxt(year, expedition, txt): trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt) for triphead, triptext in trippara: logbook_entry_count += 1 + tid = set_trip_id(year,logbook_entry_count) + tripheadp = triphead.split("|") if not (len(tripheadp) == 3): message = " ! - Bad no of items in tripdate in logbook: " + tripdate + " - " + tripheadp @@ -198,30 +214,24 @@ def Parselogwikitxt(year, expedition, txt): logentries.append(entrytuple) EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, - expedition=expedition, logtime_underground=0) + expedition=expedition, logtime_underground=0, tid=tid) EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople, - tu, "wiki", tripid, logbook_entry_count) + tu, "wiki", tripid, logbook_entry_count, tid=tid) -def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq): +def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq, tid=None): # This will need additional functions to replicate the persontrip calculation and storage. For the # moment we leave all that to be done in the django db global trips # should be a singleton TROG eventually global logdataissues - if tripid1 is None or tripid1 =="": - tid= "n{}-s{:02d}".format(str(date),seq) - #print(" - New id ",tid) - else: - tid= tripid1 - if tid in trips: tyear, tdate, *trest = trips[tid] msg = f" ! DUPLICATE on {tdate} id: '{tid}'" print(msg) DataIssue.objects.create(parser='logbooks', message=msg) - tid= "d{}-s{:02d}".format(str(date),seq) + tid = set_trip_id(str(date),seq) #print(" - De-dup ",seq, tid) logdataissues[tid]=msg trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu, formattype) @@ -247,7 +257,7 @@ def Parseloghtmltxt(year, expedition, txt): logbook_entry_count = 0 for trippara in tripparas: logbook_entry_count += 1 - tid= "n{}-s{:02d}".format(year,logbook_entry_count) + tid = set_trip_id(year,logbook_entry_count) s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)? @@ -285,7 +295,7 @@ def Parseloghtmltxt(year, expedition, txt): entry_type="html", tid=tid) EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, - "html", tripid1, logbook_entry_count) + "html", tripid1, logbook_entry_count, tid=tid) # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. @@ -298,18 +308,26 @@ def Parseloghtml01(year, expedition, txt): logbook_entry_count = 0 for trippara in tripparas: logbook_entry_count += 1 - tid= f"{year}.s{logbook_entry_count:02d}" + tid = set_trip_id(year,logbook_entry_count) try: s = re.match(r"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara) if not s: - message = " ! - Skipping logentry on failure to parse header: " + tid + trippara[:300] + "..." + message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..." DataIssue.objects.create(parser='logbooks', message=message) logdataissues[tid]=message print(message) break tripheader, triptext = s.group(1), s.group(2) mtripid = re.search(r'<a id="(.*?)"', tripheader) + # if not mtripid: + # # not an error, this is probabluy jusyt a different year + # message = f" ! - Fail id trip:{tid} header:'{tripheader}'" + # DataIssue.objects.create(parser='logbooks', message=message) + # logdataissues[tid]=message + # print(message) + tripid = mtripid and mtripid.group(1) or "" + #print(f" # - mtripid: {mtripid}") tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader) tripdate, triptitle, trippeople = tripheader.split("|") @@ -336,6 +354,13 @@ def Parseloghtml01(year, expedition, txt): ltriptext = re.sub(r"</?u>", "_", ltriptext) ltriptext = re.sub(r"</?i>", "''", ltriptext) ltriptext = re.sub(r"</?b>", "'''", ltriptext) + + if ltriptext == "": + message = " ! - Zero content for logbook entry!: " + tid + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tid]=message + print(message) + entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, "html01", tripid) @@ -343,16 +368,16 @@ def Parseloghtml01(year, expedition, txt): try: EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0, - entry_type="html") - + entry_type="html", tid=tid) except: message = " ! - Enter log entry into database FAIL exception in: " + tid DataIssue.objects.create(parser='logbooks', message=message) logdataissues[tid]=message print(message) + try: EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, - "html01", tripid, logbook_entry_count) + "html01", tripid, logbook_entry_count, tid=tid) except: message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid DataIssue.objects.create(parser='logbooks', message=message) @@ -360,7 +385,7 @@ def Parseloghtml01(year, expedition, txt): print(message) except: - message = " ! - Skipping logentry due to exception in: " + tid + message = f" ! - Skipping logentry {year} due to exception in: {tid}" DataIssue.objects.create(parser='logbooks', message=message) logdataissues[tid]=message print(message) @@ -381,11 +406,11 @@ def Parseloghtml03(year, expedition, txt): logbook_entry_count = 0 for trippara in tripparas: logbook_entry_count += 1 - tid= f"{year}.s{logbook_entry_count:02d}" + tid = set_trip_id(year,logbook_entry_count) s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara) if not ( s ) : - message = " ! - Skipping logentry on failure to parse Parseloghtml03: {} {} {}...".format(tid,s,trippara[:300]) + message = " ! - Skipping logentry {year} on failure to parse Parseloghtml03: {} {} {}...".format(tid,s,trippara[:300]) DataIssue.objects.create(parser='logbooks', message=message) logdataissues[tid]=message print(message) @@ -419,10 +444,10 @@ def Parseloghtml03(year, expedition, txt): EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, - logtime_underground=0, entry_type="html") + logtime_underground=0, entry_type="html", tid=tid) EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, - "html03", tid, logbook_entry_count) + "html03", tid, logbook_entry_count, tid=tid) def SetDatesFromLogbookEntries(expedition): @@ -477,16 +502,15 @@ def LoadLogbookForExpedition(expedition, expect): for di in dataissues: ph = year if re.search(ph, di.message) is not None: - print(f' - CLEANING dataissue {di.message}') + #print(f' - CLEANING dataissue {di.message}') di.delete() - print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year') + #print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year') dellist = [] for key, value in logdataissues.items(): - # tripentry = year + "." + str(logbook_entry_count) - print(f' - CLEAN [{key}]') - if key.startswith(year + "."): - print(f' - CLEANING logdataissues [{key:12}]: value ') + #print(f' - CLEANING logdataissues [{key}]: {value}') + if key.startswith(year): + #print(f' - CLEANING logdataissues [{key:12}]: {value} ') dellist.append(key) for i in dellist: del logdataissues[i] @@ -547,7 +571,9 @@ def LoadLogbookForExpedition(expedition, expect): if logbook_parseable: parser = globals()[parsefunc] - parser(expedition.year, expedition, txt) + + parser(expedition.year, expedition, txt) # this launches the parser + SetDatesFromLogbookEntries(expedition) if len(logentries) >0: print(" - Cacheing " , len(logentries), " log entries") @@ -625,112 +651,13 @@ def LoadLogbooks(): odb.sync() odb.close() -dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S) -expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S) -titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S) -reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S) -personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S) -nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S) -TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S) -locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S) -caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S) - -def parseAutoLogBookEntry(filename): - '''An AutoLogBookEntry appears to be one that was created online using a form, for a single trip, - which is then stored in a separate location to the usual logbook.html - But when importing logbook.html all these individual entries also need ot be parsed. - - This is all redundant as we are getting rid of the whole individual trip entry system - ''' - errors = [] - f = open(filename, "r") - contents = f.read() - f.close() - - dateMatch = dateRegex.search(contents) - if dateMatch: - year, month, day = [int(x) for x in dateMatch.groups()] - date = datetime.date(year, month, day) - else: - errors.append(" - Date could not be found") - - expeditionYearMatch = expeditionYearRegex.search(contents) - if expeditionYearMatch: - try: - expedition = Expedition.objects.get(year = expeditionYearMatch.groups()[0]) - personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition) - except Expedition.DoesNotExist: - errors.append(" - Expedition not in database") - else: - errors.append(" - Expedition Year could not be parsed") - - titleMatch = titleRegex.search(contents) - if titleMatch: - title, = titleMatch.groups() - if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH: - errors.append(" - Title too long") - else: - errors.append(" - Title could not be found") - - caveMatch = caveRegex.search(contents) - if caveMatch: - caveRef, = caveMatch.groups() - try: - # this is a slow and uncertain function: - cave = getCaveByReference(caveRef) - except: - cave = None - errors.append(" - Cave not found in database") - else: - cave = None - - locationMatch = locationRegex.search(contents) - if locationMatch: - location, = locationMatch.groups() - else: - location = None - - if cave is None and location is None: - errors.append(" - Location nor cave could not be found") +# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S) +# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S) +# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S) +# reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S) +# personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S) +# nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S) +# TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S) +# locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S) +# caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S) - reportMatch = reportRegex.search(contents) - if reportMatch: - report, = reportMatch.groups() - else: - errors.append(" - Contents could not be found") - if errors: - return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from. - people = [] - for personMatch in personRegex.findall(contents): - nameAuthorMatch = nameAuthorRegex.search(contents) - if nameAuthorMatch: - author, name = nameAuthorMatch.groups() - if name.lower() in personExpeditionNameLookup: - personExpo = personExpeditionNameLookup[name.lower()] - else: - errors.append(" - Person could not be found in database") - author = bool(author) - else: - errors.append(" - Persons name could not be found") - - TUMatch = TURegex.search(contents) - if TUMatch: - TU, = TUMatch.groups() - else: - errors.append(" - TU could not be found") - if not errors: - people.append((name, author, TU)) - if errors: - return errors # Bail out before committing to the database - logbookEntry = LogbookEntry(date = date, - expedition = expedition, - title = title, cave = cave, place = location, - text = report, slug = slugify(title)[:50], - filename = filename) - logbookEntry.save() - for name, author, TU in people: - PersonTrip(personexpedition = personExpo, - time_underground = TU, - logbook_entry = logbookEntry, - is_logbook_entry_author = author).save() - print(logbookEntry)
\ No newline at end of file |