summary | refs | log | tree | commit | diff | stats
path: root/parsers
diff options
context:
space:
mode:
Diffstat (limited to 'parsers')
-rw-r--r-- parsers/logbooks.py 112
1 files changed, 55 insertions, 57 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 46aba96..d615930 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -58,10 +58,12 @@ trips ={}
#
# the logbook loading section
#
-def GetTripPersons(trippeople, expedition, logtime_underground):
+def GetTripPersons(trippeople, expedition, logtime_underground, tid="!"):
res = [ ]
author = None
round_bracket_regex = re.compile(r"[\(\[].*?[\)\]]")
+ if tid =="!":
+ tid = expedition.year + "." + tripperson
for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople):
tripperson = tripperson.strip()
mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson)
@@ -71,10 +73,10 @@ def GetTripPersons(trippeople, expedition, logtime_underground):
tripperson = re.sub(round_bracket_regex, "", tripperson).strip()
personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
if not personyear:
- message = "No name match for: ||'%s'|| in year '%s'" % (tripperson, expedition.year)
+ message = f" ! - {expedition.year} No name match for: '{tripperson}' "
print(message)
DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues[expedition.year + "~" + tripperson]=message
+ logdataissues[tid]=message
res.append((personyear, logtime_underground))
if mul:
author = personyear
@@ -84,37 +86,18 @@ def GetTripPersons(trippeople, expedition, logtime_underground):
author = res[-1][0]
return res, author
-def GetTripCave(place):
- try:
- katastNumRes=[]
- katastNumRes=list(Cave.objects.filter(kataster_number=int(place)))
- except ValueError:
- message = " ! - ValueError on finding place " + str(place) + " entered. " + tripdate + " - " + year
- DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues["author"]=message
- officialNameRes=list(Cave.objects.filter(official_name=place))
- tripCaveRes=officialNameRes+katastNumRes
-
- if len(tripCaveRes)==1:
- return tripCaveRes[0]
- elif len(tripCaveRes)>1:
- message = " ! - Ambiguous place " + str(place) + " entered. " + tripdate + " - " + year + " " + str(tripCaveRes)
- DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues["author"]=message
- return tripCaveRes[0]
- else:
- print((" " , place))
- message = " ! - No cave found for place:" + str(place) + tripdate + " - " + year
- DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues["author"]=message
- return None
-
-
-def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"):
+def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid="!"):
""" saves a logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why?
"""
- trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
+ try:
+ trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
+ except:
+ message = " ! - Skipping logentry: %s - GetTripPersons FAIL in year '%s'" % (title, expedition.year)
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues["title"]=message
+ return
+
if not author:
print(" ! - Skipping logentry: " + title + " - no author for entry")
message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
@@ -264,7 +247,8 @@ def Parseloghtmltxt(year, expedition, txt):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
-
+ tid= "n{}-s{:02d}".format(year,logbook_entry_count)
+
s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
\s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
\s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
@@ -298,7 +282,7 @@ def Parseloghtmltxt(year, expedition, txt):
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0,
- entry_type="html")
+ entry_type="html", tid=tid)
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
"html", tripid1, logbook_entry_count)
@@ -314,13 +298,13 @@ def Parseloghtml01(year, expedition, txt):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
+ tid= f"{year}.s{logbook_entry_count:02d}"
try:
- tripentry = year + "." + str(logbook_entry_count)
s = re.match(r"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
if not s:
- message = " ! - Skipping logentry on failure to parse header: " + tripentry + trippara[:300] + "..."
+ message = " ! - Skipping logentry on failure to parse header: " + tid + trippara[:300] + "..."
DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues[tripentry]=message
+ logdataissues[tid]=message
print(message)
break
tripheader, triptext = s.group(1), s.group(2)
@@ -356,23 +340,35 @@ def Parseloghtml01(year, expedition, txt):
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, "html01", tripid)
logentries.append(entrytuple)
+ try:
+ EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
+ trippeople=trippeople, expedition=expedition, logtime_underground=0,
+ entry_type="html")
- EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
- trippeople=trippeople, expedition=expedition, logtime_underground=0,
- entry_type="html")
-
- EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
- "html01", tripid, logbook_entry_count)
+ except:
+ message = " ! - Enter log entry into database FAIL exception in: " + tid
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+ try:
+ EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
+ "html01", tripid, logbook_entry_count)
+ except:
+ message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+
except:
- message = " ! - Skipping logentry due to exception in: " + tripentry
+ message = " ! - Skipping logentry due to exception in: " + tid
DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues[tripentry]=message
+ logdataissues[tid]=message
print(message)
errorcount += 1
if errorcount >5 :
- message = " !!- TOO MANY ERRORS - aborting logbook: " + year
+ message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues[tripentry]=message
+ logdataissues[tid]=message
print(message)
return
@@ -385,12 +381,13 @@ def Parseloghtml03(year, expedition, txt):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
+ tid= f"{year}.s{logbook_entry_count:02d}"
s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
if not ( s ) :
- message = " ! - Skipping logentry on failure to parse Parseloghtml03: {} {} {}...".format(tripentry,s,trippara[:300])
+ message = " ! - Skipping logentry on failure to parse Parseloghtml03: {} {} {}...".format(tid,s,trippara[:300])
DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues[tripentry]=message
+ logdataissues[tid]=message
print(message)
break
@@ -415,7 +412,6 @@ def Parseloghtml03(year, expedition, txt):
ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
- tid= "n{}-s{:02d}".format(str(ldate),logbook_entry_count)
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, "html03", tid)
@@ -477,21 +473,23 @@ def LoadLogbookForExpedition(expedition, expect):
def cleanerrors(year):
global logdataissues
- print(f' - CLEAN {year} {len(logdataissues)} data issues in total')
dataissues = DataIssue.objects.filter(parser='logbooks')
for di in dataissues:
- ph = "t" + year + "-"
+ ph = year
if re.search(ph, di.message) is not None:
print(f' - CLEANING dataissue {di.message}')
di.delete()
- for te, content in logdataissues:
+ print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year')
+ dellist = []
+ for key, value in logdataissues.items():
# tripentry = year + "." + str(logbook_entry_count)
- print(f' - CLEAN {te}')
- if te.startswith(year + "."):
- print(f' - CLEANING logdataissue {te}')
- logdataissues.pop(te)
-
+ print(f' - CLEAN [{key}]')
+ if key.startswith(year + "."):
+ print(f' - CLEANING logdataissues [{key:12}]: value ')
+ dellist.append(key)
+ for i in dellist:
+ del logdataissues[i]
cleanerrors(expedition.year)