diff options
Diffstat (limited to 'parsers')
-rw-r--r-- | parsers/imports.py | 2 | ||||
-rw-r--r-- | parsers/logbooks.py | 139 |
2 files changed, 128 insertions, 13 deletions
diff --git a/parsers/imports.py b/parsers/imports.py index e445a2a..2f318b8 100644 --- a/parsers/imports.py +++ b/parsers/imports.py @@ -41,7 +41,7 @@ def import_logbooks(): with transaction.atomic(): troggle.parsers.logbooks.LoadLogbooks() -def import_logbook(year=1982): +def import_logbook(year=1987): print(f"-- Importing Logbook {year}") with transaction.atomic(): troggle.parsers.logbooks.LoadLogbook(year) diff --git a/parsers/logbooks.py b/parsers/logbooks.py index e3d9d06..cd95304 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -76,7 +76,8 @@ LOGBOOK_PARSER_SETTINGS = { "1990": ("log.htm", "parser_html_01"), "1989": ("log.htm", "parser_html_01"), #crashes MySQL "1988": ("log.htm", "parser_html_01"), #crashes MySQL - "1987": ("log.htm", "parser_html_01"), #crashes MySQL + #"1987": ("log.htm", "parser_02"), #crashes MySQL + "1987": ("logbook.html", "parser_html"), "1985": ("logbook.html", "parser_html"), "1984": ("logbook.html", "parser_html"), "1983": ("logbook.html", "parser_html"), @@ -87,7 +88,7 @@ entries = { "2022": 86, "2019": 56, "2018": 100, "2017": 76, "2016": 83, "2015": "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53, "2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42, - "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1, + "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 34, "1985": 24, "1984": 32, "1983": 52, "1982": 42,} # Logbooks log.htm exist for 87, 88, 89 but have no full-working parser, or need hand-editing. @@ -366,10 +367,118 @@ def parser_html(year, expedition, txt, seq=""): trippeople, expedition, tu, tripid1) logentries.append(entrytuple) - # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place def parser_html_01(year, expedition, txt, seq=""): + global logentries + global logdataissues + errorcount = 0 + + tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt) + logbook_entry_count = 0 + for trippara in tripparas: + logbook_entry_count += 1 + tid = set_trip_id(year,logbook_entry_count) + # print(f" #0 - tid: {tid}") + try: + #print(f" #1 - tid: {tid}") + s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara) + if not s: + message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..." + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tid]=message + print(message) + break + try: + tripheader, triptext = s.group(1), s.group(2) + except: + message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'" + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tid]=message + print(message) + + + # mtripid = re.search(r'<a id="(.*?)"', tripheader) + # if not mtripid: + # message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'" + # DataIssue.objects.create(parser='logbooks', message=message) + # logdataissues[tid]=message + # print(message) + + # tripid = mtripid and mtripid.group(1) or "" + # print(f" # - mtripid: {mtripid}") + tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader) + #print(f" #2 - tid: {tid}") + try: + tripdate, triptitle, trippeople = tripheader.split("|") + except: + message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'" + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tid]=message + print(message) + try: + tripdate, triptitle = tripheader.split("|") + trippeople = "GUESS ANON" + except: + message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !" + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tid]=message + print(message) + break + #print(f" #3 - tid: {tid}") + ldate = ParseDate(tripdate.strip(), year) + #print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>") + #print(f" #4 - tid: {tid}") + + mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext) + if mtu: + tu = mtu.group(1) + triptext = triptext[:mtu.start(0)] + triptext[mtu.end():] + else: + tu = "" + + triptitles = triptitle.split(" - ") + tripcave = triptitles[0].strip() + + ltriptext = triptext + + mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext) + if mtail: + ltriptext = ltriptext[:mtail.start(0)] + ltriptext = re.sub(r"</p>", "", ltriptext) + ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub(r"</?u>", "_", ltriptext) + ltriptext = re.sub(r"</?i>", "''", ltriptext) + ltriptext = re.sub(r"</?b>", "'''", ltriptext) + ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip() + + if ltriptext == "": + message = " ! - Zero content for logbook entry!: " + tid + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tid]=message + print(message) + + + entrytuple = (ldate, tripcave, triptitle, ltriptext, + trippeople, expedition, tu, tid) + logentries.append(entrytuple) + + except: + message = f" ! - Skipping logentry {year} due to exception in: {tid}" + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tid]=message + print(message) + errorcount += 1 + raise + if errorcount >5 : + message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}" + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tid]=message + print(message) + return + +# variant parser for 1987 +def parser_02(year, expedition, txt, seq=""): '''This uses some of the more obscure capabilities of regular expressions, see https://docs.python.org/3/library/re.html ''' @@ -381,7 +490,7 @@ def parser_html_01(year, expedition, txt, seq=""): headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt) headpara = headmatch.groups()[0].strip() - # print(f" - headpara:\n'{headpara}'") + #print(f" - headpara:\n'{headpara}'") if(len(headpara)>0): frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html") with open(frontpath,"w") as front: @@ -394,22 +503,28 @@ def parser_html_01(year, expedition, txt, seq=""): tid = set_trip_id(year,logbook_entry_count) # print(f" #0 - tid: {tid}") try: - #print(f" #1 - tid: {tid}") - s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara) + # print(f" #1 - tid: {tid}") + #s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara) + s = re.match(r"(?i)(?s)\s*(<hr[\s/]*>)?.*?<a[^>]*>([\s\S]*?)</a>(.*)$", trippara) if not s: - message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..." + message = " ! - Skipping logentry {tid} failure to parse header: " + tid + trippara[:300] + "..." DataIssue.objects.create(parser='logbooks', message=message) logdataissues[tid]=message print(message) break + #print(s.group(2)) + #print(s.group(3)[:80]) try: - tripheader, triptext = s.group(1), s.group(2) + tripheader, triptext = s.group(2), s.group(3) except: message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'" DataIssue.objects.create(parser='logbooks', message=message) logdataissues[tid]=message print(message) + print(f" {tid} {tripheader}") + if not tripheader: + continue # mtripid = re.search(r'<a id="(.*?)"', tripheader) # if not mtripid: @@ -421,11 +536,11 @@ def parser_html_01(year, expedition, txt, seq=""): # tripid = mtripid and mtripid.group(1) or "" # print(f" # - mtripid: {mtripid}") tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader) - #print(f" #2 - tid: {tid}") + # print(f" #2 - tid: {tid}") try: tripdate, triptitle, trippeople = tripheader.split("|") except: - message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'" + message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}>\n '{tripheader.split('|')}'" DataIssue.objects.create(parser='logbooks', message=message) logdataissues[tid]=message print(message) @@ -438,11 +553,11 @@ def parser_html_01(year, expedition, txt, seq=""): logdataissues[tid]=message print(message) break - #print(f" #3 - tid: {tid}") + # print(f" #3 - tid: {tid}") triptitle = triptitle.strip() ldate = ParseDate(tripdate.strip(), year) #print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>") - #print(f" #4 - tid: {tid}") + # print(f" #4 - tid: {tid}") mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext) if mtu: |