summaryrefslogtreecommitdiffstats
path: root/parsers/logbooks.py
diff options
context:
space:
mode:
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r--parsers/logbooks.py294
1 files changed, 134 insertions, 160 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 0ab902a..5ef125e 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -26,12 +26,6 @@ todo = """
- far too many uses of Django field dereferencing to get values, which is SLOW
-- Logbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
-
-- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
- we keep only a modern HTML05 format. Then we can retire the old parsers and reduce the
- volume of code here substantially.
-
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
- We should ensure logbook.html is utf-8 and stop this crap:
@@ -46,35 +40,16 @@ data for old logbooks? Not worth it..
"""
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
BLOG_PARSER_SETTINGS = {
- # "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
- # "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
- # "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
- # "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
+ # "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
+ # "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
+ # "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
+ # "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
}
DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"
-# All years since 2002 use the default value for Logbook parser
-# dont forget to update expoweb/pubs.htm to match.
+# All years now (Jan. 2023) use the default value for Logbook parser
+# don't forget to update expoweb/pubs.htm to match. 1982 left as reminder of expected format.
LOGBOOK_PARSER_SETTINGS = {
- "2002": ("logbook.html", "parser_html"),
- "2001": ("log.htm", "parser_html_01"),
- "2000": ("log.htm", "parser_html_01"),
- "1999": ("log.htm", "parser_html_01"),
- "1998": ("log.htm", "parser_html_01"),
- "1997": ("log.htm", "parser_html_01"),
- "1996": ("log.htm", "parser_html_01"),
- "1995": ("log.htm", "parser_html_01"),
- "1994": ("logbook.html", "parser_html"),
- "1993": ("logbook.html", "parser_html"),
- "1992": ("logbook.html", "parser_html"),
- "1991": ("logbook.html", "parser_html"),
- "1990": ("logbook.html", "parser_html"),
- "1989": ("logbook.html", "parser_html"),
- "1988": ("logbook.html", "parser_html"),
- "1987": ("logbook.html", "parser_html"),
- "1985": ("logbook.html", "parser_html"),
- "1984": ("logbook.html", "parser_html"),
- "1983": ("logbook.html", "parser_html"),
"1982": ("logbook.html", "parser_html"),
}
@@ -325,13 +300,12 @@ def ParseDate(tripdate, year):
return datetime.date(1970, 1, 1)
-# 2002 - now
def parser_html(year, expedition, txt, seq=""):
"""This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
You can't see it here, but a round-trip export-then-import will move
- the endmatter up to the frontmatter. This makes sense when moving
+ the endmatter up to the frontmatter. This made sense when translating
from parser_html_01 format logfiles, believe me.
"""
global logentries
@@ -422,134 +396,134 @@ def parser_html(year, expedition, txt, seq=""):
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
-def parser_html_01(year, expedition, txt, seq=""):
- global logentries
- global logdataissues
- errorcount = 0
-
- # extract front material and stash for later use when rebuilding from list of entries
- headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
- headpara = headmatch.groups()[0].strip()
-
- # print(f" - headpara:\n'{headpara}'")
- if len(headpara) > 0:
- frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
- with open(frontpath, "w") as front:
- front.write(headpara + "\n")
-
- # extract END material and stash for later use when rebuilding from list of entries
- endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
- if endmatch:
- endpara = endmatch.groups()[0].strip()
- else:
- print(f" ! - {year} NO endmatch")
- endpara = ""
-
- # print(f" - endpara:\n'{endpara}'")
- if len(endpara) > 0:
- endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html")
- with open(endpath, "w") as end:
- end.write(endpara + "\n")
-
- tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
- logbook_entry_count = 0
- for trippara in tripparas:
- logbook_entry_count += 1
- tid = set_trip_id(year, logbook_entry_count)
- # print(f" #0 - tid: {tid}")
- try:
- # print(f" #1 - tid: {tid}")
- s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
- if not s:
- message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
- DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues[tid] = message
- print(message)
- break
- try:
- tripheader, triptext = s.group(1), s.group(2)
- except:
- message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
- DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues[tid] = message
- print(message)
-
- # mtripid = re.search(r'<a id="(.*?)"', tripheader)
- # if not mtripid:
- # message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'"
- # DataIssue.objects.create(parser='logbooks', message=message)
- # logdataissues[tid]=message
+# def parser_html_01(year, expedition, txt, seq=""):
+ # global logentries
+ # global logdataissues
+ # errorcount = 0
+
+ # # extract front material and stash for later use when rebuilding from list of entries
+ # headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
+ # headpara = headmatch.groups()[0].strip()
+
+ # # print(f" - headpara:\n'{headpara}'")
+ # if len(headpara) > 0:
+ # frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
+ # with open(frontpath, "w") as front:
+ # front.write(headpara + "\n")
+
+ # # extract END material and stash for later use when rebuilding from list of entries
+ # endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
+ # if endmatch:
+ # endpara = endmatch.groups()[0].strip()
+ # else:
+ # print(f" ! - {year} NO endmatch")
+ # endpara = ""
+
+ # # print(f" - endpara:\n'{endpara}'")
+ # if len(endpara) > 0:
+ # endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html")
+ # with open(endpath, "w") as end:
+ # end.write(endpara + "\n")
+
+ # tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
+ # logbook_entry_count = 0
+ # for trippara in tripparas:
+ # logbook_entry_count += 1
+ # tid = set_trip_id(year, logbook_entry_count)
+ # # print(f" #0 - tid: {tid}")
+ # try:
+ # # print(f" #1 - tid: {tid}")
+ # s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
+ # if not s:
+ # message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
+ # DataIssue.objects.create(parser="logbooks", message=message)
+ # logdataissues[tid] = message
+ # print(message)
+ # break
+ # try:
+ # tripheader, triptext = s.group(1), s.group(2)
+ # except:
+ # message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
+ # DataIssue.objects.create(parser="logbooks", message=message)
+ # logdataissues[tid] = message
+ # print(message)
+
+ # # mtripid = re.search(r'<a id="(.*?)"', tripheader)
+ # # if not mtripid:
+ # # message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'"
+ # # DataIssue.objects.create(parser='logbooks', message=message)
+ # # logdataissues[tid]=message
+ # # print(message)
+
+ # # tripid = mtripid and mtripid.group(1) or ""
+ # # print(f" # - mtripid: {mtripid}")
+ # tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
+ # # print(f" #2 - tid: {tid}")
+ # try:
+ # tripdate, triptitle, trippeople = tripheader.split("|")
+ # except:
+ # message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
+ # DataIssue.objects.create(parser="logbooks", message=message)
+ # logdataissues[tid] = message
+ # print(message)
+ # try:
+ # tripdate, triptitle = tripheader.split("|")
+ # trippeople = "GUESS ANON"
+ # except:
+ # message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
+ # DataIssue.objects.create(parser="logbooks", message=message)
+ # logdataissues[tid] = message
+ # print(message)
+ # break
+ # # print(f" #3 - tid: {tid}")
+ # ldate = ParseDate(tripdate.strip(), year)
+ # # print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
+ # # print(f" #4 - tid: {tid}")
+
+ # mtu = re.search(r"<p[^>]*>(T/?U.*)", triptext)
+ # if mtu:
+ # tu = mtu.group(1)
+ # triptext = triptext[: mtu.start(0)] + triptext[mtu.end() :]
+ # else:
+ # tu = ""
+
+ # triptitles = triptitle.split(" - ")
+ # tripcave = triptitles[0].strip()
+
+ # ltriptext = triptext
+
+ # mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
+ # if mtail:
+ # ltriptext = ltriptext[: mtail.start(0)]
+ # ltriptext = re.sub(r"</p>", "", ltriptext)
+ # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+ # ltriptext = re.sub(r"</?u>", "_", ltriptext)
+ # ltriptext = re.sub(r"</?i>", "''", ltriptext)
+ # ltriptext = re.sub(r"</?b>", "'''", ltriptext)
+ # ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
+
+ # if ltriptext == "":
+ # message = " ! - Zero content for logbook entry!: " + tid
+ # DataIssue.objects.create(parser="logbooks", message=message)
+ # logdataissues[tid] = message
+ # print(message)
+
+ # entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid)
+ # logentries.append(entrytuple)
+
+ # except:
+ # message = f" ! - Skipping logentry {year} due to exception in: {tid}"
+ # DataIssue.objects.create(parser="logbooks", message=message)
+ # logdataissues[tid] = message
# print(message)
-
- # tripid = mtripid and mtripid.group(1) or ""
- # print(f" # - mtripid: {mtripid}")
- tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
- # print(f" #2 - tid: {tid}")
- try:
- tripdate, triptitle, trippeople = tripheader.split("|")
- except:
- message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
- DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues[tid] = message
- print(message)
- try:
- tripdate, triptitle = tripheader.split("|")
- trippeople = "GUESS ANON"
- except:
- message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
- DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues[tid] = message
- print(message)
- break
- # print(f" #3 - tid: {tid}")
- ldate = ParseDate(tripdate.strip(), year)
- # print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
- # print(f" #4 - tid: {tid}")
-
- mtu = re.search(r"<p[^>]*>(T/?U.*)", triptext)
- if mtu:
- tu = mtu.group(1)
- triptext = triptext[: mtu.start(0)] + triptext[mtu.end() :]
- else:
- tu = ""
-
- triptitles = triptitle.split(" - ")
- tripcave = triptitles[0].strip()
-
- ltriptext = triptext
-
- mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
- if mtail:
- ltriptext = ltriptext[: mtail.start(0)]
- ltriptext = re.sub(r"</p>", "", ltriptext)
- ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub(r"</?u>", "_", ltriptext)
- ltriptext = re.sub(r"</?i>", "''", ltriptext)
- ltriptext = re.sub(r"</?b>", "'''", ltriptext)
- ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
-
- if ltriptext == "":
- message = " ! - Zero content for logbook entry!: " + tid
- DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues[tid] = message
- print(message)
-
- entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid)
- logentries.append(entrytuple)
-
- except:
- message = f" ! - Skipping logentry {year} due to exception in: {tid}"
- DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues[tid] = message
- print(message)
- errorcount += 1
- raise
- if errorcount > 5:
- message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
- DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues[tid] = message
- print(message)
- return
+ # errorcount += 1
+ # raise
+ # if errorcount > 5:
+ # message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
+ # DataIssue.objects.create(parser="logbooks", message=message)
+ # logdataissues[tid] = message
+ # print(message)
+ # return
def parser_blog(year, expedition, txt, sq=""):