summary | refs | log | tree | commit | diff | stats
path: root/parsers/logbooks.py
diff options
context:
space:
mode:
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- parsers/logbooks.py 139
1 files changed, 127 insertions, 12 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index e3d9d06..cd95304 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -76,7 +76,8 @@ LOGBOOK_PARSER_SETTINGS = {
"1990": ("log.htm", "parser_html_01"),
"1989": ("log.htm", "parser_html_01"), #crashes MySQL
"1988": ("log.htm", "parser_html_01"), #crashes MySQL
- "1987": ("log.htm", "parser_html_01"), #crashes MySQL
+ #"1987": ("log.htm", "parser_02"), #crashes MySQL
+ "1987": ("logbook.html", "parser_html"),
"1985": ("logbook.html", "parser_html"),
"1984": ("logbook.html", "parser_html"),
"1983": ("logbook.html", "parser_html"),
@@ -87,7 +88,7 @@ entries = { "2022": 86, "2019": 56, "2018": 100, "2017": 76, "2016": 83, "2015":
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53,
"2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
- "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
+ "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 34,
"1985": 24, "1984": 32, "1983": 52, "1982": 42,}
# Logbooks log.htm exist for 87, 88, 89 but have no full-working parser, or need hand-editing.
@@ -366,10 +367,118 @@ def parser_html(year, expedition, txt, seq=""):
trippeople, expedition, tu, tripid1)
logentries.append(entrytuple)
-
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
def parser_html_01(year, expedition, txt, seq=""):
+ global logentries
+ global logdataissues
+ errorcount = 0
+
+ tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
+ logbook_entry_count = 0
+ for trippara in tripparas:
+ logbook_entry_count += 1
+ tid = set_trip_id(year,logbook_entry_count)
+ # print(f" #0 - tid: {tid}")
+ try:
+ #print(f" #1 - tid: {tid}")
+ s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
+ if not s:
+ message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+ break
+ try:
+ tripheader, triptext = s.group(1), s.group(2)
+ except:
+ message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+
+
+ # mtripid = re.search(r'<a id="(.*?)"', tripheader)
+ # if not mtripid:
+ # message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'"
+ # DataIssue.objects.create(parser='logbooks', message=message)
+ # logdataissues[tid]=message
+ # print(message)
+
+ # tripid = mtripid and mtripid.group(1) or ""
+ # print(f" # - mtripid: {mtripid}")
+ tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
+ #print(f" #2 - tid: {tid}")
+ try:
+ tripdate, triptitle, trippeople = tripheader.split("|")
+ except:
+ message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+ try:
+ tripdate, triptitle = tripheader.split("|")
+ trippeople = "GUESS ANON"
+ except:
+ message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+ break
+ #print(f" #3 - tid: {tid}")
+ ldate = ParseDate(tripdate.strip(), year)
+ #print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
+ #print(f" #4 - tid: {tid}")
+
+ mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
+ if mtu:
+ tu = mtu.group(1)
+ triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
+ else:
+ tu = ""
+
+ triptitles = triptitle.split(" - ")
+ tripcave = triptitles[0].strip()
+
+ ltriptext = triptext
+
+ mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
+ if mtail:
+ ltriptext = ltriptext[:mtail.start(0)]
+ ltriptext = re.sub(r"</p>", "", ltriptext)
+ ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+ ltriptext = re.sub(r"</?u>", "_", ltriptext)
+ ltriptext = re.sub(r"</?i>", "''", ltriptext)
+ ltriptext = re.sub(r"</?b>", "'''", ltriptext)
+ ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
+
+ if ltriptext == "":
+ message = " ! - Zero content for logbook entry!: " + tid
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+
+
+ entrytuple = (ldate, tripcave, triptitle, ltriptext,
+ trippeople, expedition, tu, tid)
+ logentries.append(entrytuple)
+
+ except:
+ message = f" ! - Skipping logentry {year} due to exception in: {tid}"
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+ errorcount += 1
+ raise
+ if errorcount >5 :
+ message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+ return
+
+# variant parser for 1987
+def parser_02(year, expedition, txt, seq=""):
'''This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
'''
@@ -381,7 +490,7 @@ def parser_html_01(year, expedition, txt, seq=""):
headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
headpara = headmatch.groups()[0].strip()
- # print(f" - headpara:\n'{headpara}'")
+ #print(f" - headpara:\n'{headpara}'")
if(len(headpara)>0):
frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
with open(frontpath,"w") as front:
@@ -394,22 +503,28 @@ def parser_html_01(year, expedition, txt, seq=""):
tid = set_trip_id(year,logbook_entry_count)
# print(f" #0 - tid: {tid}")
try:
- #print(f" #1 - tid: {tid}")
- s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
+ # print(f" #1 - tid: {tid}")
+ #s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
+ s = re.match(r"(?i)(?s)\s*(<hr[\s/]*>)?.*?<a[^>]*>([\s\S]*?)</a>(.*)$", trippara)
if not s:
- message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
+ message = " ! - Skipping logentry {tid} failure to parse header: " + tid + trippara[:300] + "..."
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
break
+ #print(s.group(2))
+ #print(s.group(3)[:80])
try:
- tripheader, triptext = s.group(1), s.group(2)
+ tripheader, triptext = s.group(2), s.group(3)
except:
message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
+ print(f" {tid} {tripheader}")
+ if not tripheader:
+ continue
# mtripid = re.search(r'<a id="(.*?)"', tripheader)
# if not mtripid:
@@ -421,11 +536,11 @@ def parser_html_01(year, expedition, txt, seq=""):
# tripid = mtripid and mtripid.group(1) or ""
# print(f" # - mtripid: {mtripid}")
tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
- #print(f" #2 - tid: {tid}")
+ # print(f" #2 - tid: {tid}")
try:
tripdate, triptitle, trippeople = tripheader.split("|")
except:
- message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
+ message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}>\n '{tripheader.split('|')}'"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
@@ -438,11 +553,11 @@ def parser_html_01(year, expedition, txt, seq=""):
logdataissues[tid]=message
print(message)
break
- #print(f" #3 - tid: {tid}")
+ # print(f" #3 - tid: {tid}")
triptitle = triptitle.strip()
ldate = ParseDate(tripdate.strip(), year)
#print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
- #print(f" #4 - tid: {tid}")
+ # print(f" #4 - tid: {tid}")
mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
if mtu: