From cb50528e2d6bc7a215a8b55b46a9859aae7f4f83 Mon Sep 17 00:00:00 2001
From: Philip Sargent
Date: Wed, 14 Dec 2022 23:46:14 +0000
Subject: exptl parse UK Caving blog

---
 parsers/logbooks.py | 132 +++++++++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 75 insertions(+), 57 deletions(-)

diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 0bbc23d..7e2870b 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -56,6 +56,7 @@ DEFAULT_LOGBOOK_FILE = "logbook.html"
 # All years since 2010 use the default value for Logbook parser
 # but several don't work, and are skipped by the parsing code, e.g. 1983
 LOGBOOK_PARSER_SETTINGS = {
+    "2019": ("logbook.html", "parser_html"),
     "2010": ("logbook.html", "parser_html"),
     "2009": ("2009logbook.txt", "wiki_parser"),
     "2008": ("2008logbook.txt", "wiki_parser"),
@@ -171,7 +172,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
         return
 
     if not author:
-        message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
+        message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
         DataIssue.objects.create(parser='logbooks', message=message)
         logdataissues["title"]=message
         print(message)
@@ -471,60 +472,68 @@ def parser_html_01(year, expedition, txt):
             print(message)
             return
 
-# parser for 2003. Retired after conversion of the logbook.html
-# KEEP THIS COMMENTED-OUT example until after we have doen the same thing with the html_01 parser
-# def parser_html_03(year, expedition, txt):
-    # global logentries
-    # global logdataissues
-
-    # tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
-    # logbook_entry_count = 0
-    # for trippara in tripparas:
-        # logbook_entry_count += 1
-        # tid = set_trip_id(year,logbook_entry_count)
-
-        # s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
-        # if not ( s ) :
-            # message = " ! - Skipping logentry {year} on failure to parse parser_html_03: {} {} {}...".format(tid,s,trippara[:300])
-            # DataIssue.objects.create(parser='logbooks', message=message)
-            # logdataissues[tid]=message
-            # print(message)
-            # break
-
-        # tripheader, triptext = s.group(1), s.group(2)
-        # tripheader = re.sub(r"&nbsp;", " ", tripheader)
-        # tripheader = re.sub(r"\s+", " ", tripheader).strip()
-        # sheader = tripheader.split(" -- ")
-        # tu = ""
-        # if re.match("T/U|Time underwater", sheader[-1]):
-            # tu = sheader.pop() # not a number in 2003 usually
-        # # print(f"  - {logbook_entry_count} '{tu}' ")
-        # if len(sheader) != 3:
-            # print(" ! Header not three pieces for parser_html_03() ", sheader)
-        # tripdate, triptitle, trippeople = sheader
-        # ldate = ParseDate(tripdate.strip(), year)
-        # # print(f"  - {logbook_entry_count} '{ldate}' from '{tripdate.strip()}' ")
-        # # print(f"  - {logbook_entry_count} '{trippeople}' ")
-        # titlelist = triptitle.split(" , ")
-        # if len(titlelist) >= 2:
-            # location, *namelist = titlelist # list unpacking operator
-            # tripname = ", ".join(namelist) # concatenate strings
-            # # print(f"  - {logbook_entry_count} {location} '{tripname}'")
-        # else:
-            # location = "UNKNOWN"
-
-        # ltriptext = triptext + "<br /><br />\n\n" + tu
-        # ltriptext = re.sub(r"</p>", "", ltriptext)
-        # #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-        # ltriptext = re.sub(r"<p>", "<br /><br />\n\n", ltriptext).strip()
-        # #ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
+def parser_blog(year, expedition, txt):
+    '''Parses the format of web pages collected as "Save As HTML" from the UK Caving blog website.
+    Note that the entries have dates and authors, but no titles.
+    '''
+    global logentries
+    global logdataissues
+    errorcount = 0
+
+    tripheads = re.findall(r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=
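
The patch excerpt above cuts off inside parser_blog, just as it starts collecting <article> blocks from the saved page. As an illustration only (not the committed troggle code), the sketch below shows the same regex-based idea on a toy snippet: take the author from a data-author attribute, the timestamp from a datetime attribute, and the post body from a message-body article. The attribute names and the sample markup are assumptions about what a "Save As HTML" copy of a UK Caving blog post contains, not verified against the real pages.

import re
from datetime import datetime

# Minimal illustration only -- NOT the committed troggle code.
# data-author, datetime and this sample markup are assumed forum-style HTML.
SAMPLE_ARTICLE = '''
<article class="message message--post" data-author="somecaver">
  <time datetime="2019-07-11T13:16:18+01:00">Jul 11, 2019</time>
  <article class="message-body">Rigged the entrance series, then surveyed 100 m of new passage.</article>
</article>
'''

def extract_blog_entry(block):
    """Return (author, date, text) from one saved blog-post block, or None on failure."""
    author = re.search(r'data-author="([^"]*)"', block)
    stamp = re.search(r'datetime="([^"]*)"', block)
    body = re.search(r'<article class="message-body[^>]*">([\s\S]*?)</article>', block)
    if not (author and stamp and body):
        return None  # the real parser records a DataIssue and skips the entry instead
    when = datetime.fromisoformat(stamp.group(1))
    return author.group(1), when, body.group(1).strip()

print(extract_blog_entry(SAMPLE_ARTICLE))

Running it prints an author name, a timezone-aware datetime and the trimmed body text, which is roughly the (date, author, text) triple the blog parser needs to build a log entry that has no title of its own.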
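The first hunk registers 2019 in LOGBOOK_PARSER_SETTINGS, a per-year map of (logbook filename, parser function name), with DEFAULT_LOGBOOK_FILE covering the years that are absent. Below is a minimal sketch, under those assumptions, of how such a table can drive dispatch by name; the parse_year helper and the stand-in parser bodies are hypothetical, not troggle's actual loader.

# Illustrative dispatcher only -- not the committed troggle code.
DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"

LOGBOOK_PARSER_SETTINGS = {
    "2019": ("logbook.html", "parser_html"),
    "2010": ("logbook.html", "parser_html"),
    "2009": ("2009logbook.txt", "wiki_parser"),
}

def parser_html(year, text):  # stand-in parser body
    return f"html-parsed {year}"

def wiki_parser(year, text):  # stand-in parser body
    return f"wiki-parsed {year}"

def parse_year(year, text):
    # Fall back to the defaults for any year not listed in the table.
    filename, funcname = LOGBOOK_PARSER_SETTINGS.get(
        year, (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER))
    parser = globals()[funcname]  # look the parser up by its stored name
    return filename, parser(year, text)

print(parse_year("2009", "..."))  # ('2009logbook.txt', 'wiki-parsed 2009')
print(parse_year("2017", "..."))  # defaults: ('logbook.html', 'html-parsed 2017')

Storing the parser as a string keeps the settings table plain data, with the lookup by name done only at parse time.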