diff options
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- | parsers/logbooks.py | 112 |
1 files changed, 55 insertions, 57 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py index 46aba96..d615930 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -58,10 +58,12 @@ trips ={} # # the logbook loading section # -def GetTripPersons(trippeople, expedition, logtime_underground): +def GetTripPersons(trippeople, expedition, logtime_underground, tid="!"): res = [ ] author = None round_bracket_regex = re.compile(r"[\(\[].*?[\)\]]") + if tid =="!": + tid = expedition.year + "." + tripperson for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople): tripperson = tripperson.strip() mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson) @@ -71,10 +73,10 @@ def GetTripPersons(trippeople, expedition, logtime_underground): tripperson = re.sub(round_bracket_regex, "", tripperson).strip() personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower()) if not personyear: - message = "No name match for: ||'%s'|| in year '%s'" % (tripperson, expedition.year) + message = f" ! - {expedition.year} No name match for: '{tripperson}' " print(message) DataIssue.objects.create(parser='logbooks', message=message) - logdataissues[expedition.year + "~" + tripperson]=message + logdataissues[tid]=message res.append((personyear, logtime_underground)) if mul: author = personyear @@ -84,37 +86,18 @@ def GetTripPersons(trippeople, expedition, logtime_underground): author = res[-1][0] return res, author -def GetTripCave(place): - try: - katastNumRes=[] - katastNumRes=list(Cave.objects.filter(kataster_number=int(place))) - except ValueError: - message = " ! - ValueError on finding place " + str(place) + " entered. " + tripdate + " - " + year - DataIssue.objects.create(parser='logbooks', message=message) - logdataissues["author"]=message - officialNameRes=list(Cave.objects.filter(official_name=place)) - tripCaveRes=officialNameRes+katastNumRes - - if len(tripCaveRes)==1: - return tripCaveRes[0] - elif len(tripCaveRes)>1: - message = " ! - Ambiguous place " + str(place) + " entered. " + tripdate + " - " + year + " " + str(tripCaveRes) - DataIssue.objects.create(parser='logbooks', message=message) - logdataissues["author"]=message - return tripCaveRes[0] - else: - print((" " , place)) - message = " ! - No cave found for place:" + str(place) + tripdate + " - " + year - DataIssue.objects.create(parser='logbooks', message=message) - logdataissues["author"]=message - return None - - -def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"): +def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid="!"): """ saves a logbook entry and related persontrips Does NOT save the expeditionday_id - all NULLs. why? """ - trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground) + try: + trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid) + except: + message = " ! - Skipping logentry: %s - GetTripPersons FAIL in year '%s'" % (title, expedition.year) + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues["title"]=message + return + if not author: print(" ! - Skipping logentry: " + title + " - no author for entry") message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year) @@ -264,7 +247,8 @@ def Parseloghtmltxt(year, expedition, txt): logbook_entry_count = 0 for trippara in tripparas: logbook_entry_count += 1 - + tid= "n{}-s{:02d}".format(year,logbook_entry_count) + s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)? \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)? @@ -298,7 +282,7 @@ def Parseloghtmltxt(year, expedition, txt): EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0, - entry_type="html") + entry_type="html", tid=tid) EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, "html", tripid1, logbook_entry_count) @@ -314,13 +298,13 @@ def Parseloghtml01(year, expedition, txt): logbook_entry_count = 0 for trippara in tripparas: logbook_entry_count += 1 + tid= f"{year}.s{logbook_entry_count:02d}" try: - tripentry = year + "." + str(logbook_entry_count) s = re.match(r"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara) if not s: - message = " ! - Skipping logentry on failure to parse header: " + tripentry + trippara[:300] + "..." + message = " ! - Skipping logentry on failure to parse header: " + tid + trippara[:300] + "..." DataIssue.objects.create(parser='logbooks', message=message) - logdataissues[tripentry]=message + logdataissues[tid]=message print(message) break tripheader, triptext = s.group(1), s.group(2) @@ -356,23 +340,35 @@ def Parseloghtml01(year, expedition, txt): entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, "html01", tripid) logentries.append(entrytuple) + try: + EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, + trippeople=trippeople, expedition=expedition, logtime_underground=0, + entry_type="html") - EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, - trippeople=trippeople, expedition=expedition, logtime_underground=0, - entry_type="html") - - EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, - "html01", tripid, logbook_entry_count) + except: + message = " ! - Enter log entry into database FAIL exception in: " + tid + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tid]=message + print(message) + try: + EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, + "html01", tripid, logbook_entry_count) + except: + message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tid]=message + print(message) + except: - message = " ! - Skipping logentry due to exception in: " + tripentry + message = " ! - Skipping logentry due to exception in: " + tid DataIssue.objects.create(parser='logbooks', message=message) - logdataissues[tripentry]=message + logdataissues[tid]=message print(message) errorcount += 1 if errorcount >5 : - message = " !!- TOO MANY ERRORS - aborting logbook: " + year + message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}" DataIssue.objects.create(parser='logbooks', message=message) - logdataissues[tripentry]=message + logdataissues[tid]=message print(message) return @@ -385,12 +381,13 @@ def Parseloghtml03(year, expedition, txt): logbook_entry_count = 0 for trippara in tripparas: logbook_entry_count += 1 + tid= f"{year}.s{logbook_entry_count:02d}" s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara) if not ( s ) : - message = " ! - Skipping logentry on failure to parse Parseloghtml03: {} {} {}...".format(tripentry,s,trippara[:300]) + message = " ! - Skipping logentry on failure to parse Parseloghtml03: {} {} {}...".format(tid,s,trippara[:300]) DataIssue.objects.create(parser='logbooks', message=message) - logdataissues[tripentry]=message + logdataissues[tid]=message print(message) break @@ -415,7 +412,6 @@ def Parseloghtml03(year, expedition, txt): ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip() ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext) - tid= "n{}-s{:02d}".format(str(ldate),logbook_entry_count) entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, "html03", tid) @@ -477,21 +473,23 @@ def LoadLogbookForExpedition(expedition, expect): def cleanerrors(year): global logdataissues - print(f' - CLEAN {year} {len(logdataissues)} data issues in total') dataissues = DataIssue.objects.filter(parser='logbooks') for di in dataissues: - ph = "t" + year + "-" + ph = year if re.search(ph, di.message) is not None: print(f' - CLEANING dataissue {di.message}') di.delete() - for te, content in logdataissues: + print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year') + dellist = [] + for key, value in logdataissues.items(): # tripentry = year + "." + str(logbook_entry_count) - print(f' - CLEAN {te}') - if te.startswith(year + "."): - print(f' - CLEANING logdataissue {te}') - logdataissues.pop(te) - + print(f' - CLEAN [{key}]') + if key.startswith(year + "."): + print(f' - CLEANING logdataissues [{key:12}]: value ') + dellist.append(key) + for i in dellist: + del logdataissues[i] cleanerrors(expedition.year) |