From cb50528e2d6bc7a215a8b55b46a9859aae7f4f83 Mon Sep 17 00:00:00 2001
From: Philip Sargent
Date: Wed, 14 Dec 2022 23:46:14 +0000
Subject: exptl parse UK Caving blog
---
parsers/logbooks.py | 132 +++++++++++++++++++++++++++++-----------------------
1 file changed, 75 insertions(+), 57 deletions(-)
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 0bbc23d..7e2870b 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -56,6 +56,7 @@ DEFAULT_LOGBOOK_FILE = "logbook.html"
# All years since 2010 use the default value for Logbook parser
# but several don't work, and are skipped by the parsing code, e.g. 1983
LOGBOOK_PARSER_SETTINGS = {
+ "2019": ("logbook.html", "parser_html"),
"2010": ("logbook.html", "parser_html"),
"2009": ("2009logbook.txt", "wiki_parser"),
"2008": ("2008logbook.txt", "wiki_parser"),
@@ -171,7 +172,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
return
if not author:
- message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
+ message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["title"]=message
print(message)
@@ -471,60 +472,68 @@ def parser_html_01(year, expedition, txt):
print(message)
return
-# parser for 2003. Retired after conversion of the logbook.html
-# KEEP THIS COMMENTED-OUT example until after we have done the same thing with the html_01 parser
-# def parser_html_03(year, expedition, txt):
- # global logentries
- # global logdataissues
-
- # tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
- # logbook_entry_count = 0
- # for trippara in tripparas:
- # logbook_entry_count += 1
- # tid = set_trip_id(year,logbook_entry_count)
-
- # s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
- # if not ( s ) :
- # message = " ! - Skipping logentry {year} on failure to parse parser_html_03: {} {} {}...".format(tid,s,trippara[:300])
- # DataIssue.objects.create(parser='logbooks', message=message)
- # logdataissues[tid]=message
- # print(message)
- # break
-
- # tripheader, triptext = s.group(1), s.group(2)
- # tripheader = re.sub(r"&nbsp;", " ", tripheader)
- # tripheader = re.sub(r"\s+", " ", tripheader).strip()
- # sheader = tripheader.split(" -- ")
- # tu = ""
- # if re.match("T/U|Time underwater", sheader[-1]):
- # tu = sheader.pop() # not a number in 2003 usually
- # # print(f" - {logbook_entry_count} '{tu}' ")
- # if len(sheader) != 3:
- # print(" ! Header not three pieces for parser_html_03() ", sheader)
- # tripdate, triptitle, trippeople = sheader
- # ldate = ParseDate(tripdate.strip(), year)
- # # print(f" - {logbook_entry_count} '{ldate}' from '{tripdate.strip()}' ")
- # # print(f" - {logbook_entry_count} '{trippeople}' ")
- # titlelist = triptitle.split(" , ")
- # if len(titlelist) >= 2:
- # location, *namelist = titlelist # list unpacking operator
- # tripname = ", ".join(namelist) # concatenate strings
- # # print(f" - {logbook_entry_count} {location} '{tripname}'")
- # else:
- # location = "UNKNOWN"
-
- # ltriptext = triptext + "<br /><br />\n\n" + tu
- # ltriptext = re.sub(r"</p>", "", ltriptext)
- # #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
- # ltriptext = re.sub(r"<p>", "<br /><br />\n\n", ltriptext).strip()
- # #ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
+def parser_blog(year, expedition, txt):
+ '''Parses the format of web pages collected as "Save As HTML" from the UK Caving blog website.
+ Note that the entries have dates and authors, but no titles.
+ '''
+ global logentries
+ global logdataissues
+ errorcount = 0
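
Since the blog posts carry a date and an author but no title, the parser only needs to locate each post block and extract those two fields before storing the body text. A rough sketch of that idea under assumed markup; the <article>, datetime and data-author markers below are illustrative guesses at the saved forum HTML, not taken from this patch:

    posts = re.findall(r"<article[\s\S]*?</article>", txt)  # one block per blog post (assumed marker)
    for post in posts:
        when = re.search(r'datetime="([^"]+)"', post)    # e.g. an ISO date on a <time> element
        who = re.search(r'data-author="([^"]+)"', post)  # author attribute, if present
        if not (when and who):
            errorcount += 1  # count posts we could not interpret, as in the function above
            continue
        # blog entries have no title, so one would be synthesised from the date and author
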
+ tripheads = re.findall(r"\s*([\s\S]*?)(?=