From f80e4efed8b6a329a7c92b9e0c68bb12faa9b517 Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Fri, 16 Dec 2022 19:57:56 +0000 Subject: parse several UK caving blogs per year - working --- parsers/logbooks.py | 173 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 106 insertions(+), 67 deletions(-) (limited to 'parsers/logbooks.py') diff --git a/parsers/logbooks.py b/parsers/logbooks.py index ccd935f..25da271 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -51,8 +51,13 @@ data for old logbooks. New design needed, with a mechanism for flagging fixtures ''' MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200 -DEFAULT_LOGBOOK_PARSER = "parser_html" +BLOG_PARSER_SETTINGS = { + "2017": ("ukcavingblog.html", "parser_blog"), + "2019": ("ukcavingblog.html", "parser_blog"), + "2022": ("ukcavingblog.html", "parser_blog"), + } DEFAULT_LOGBOOK_FILE = "logbook.html" +DEFAULT_LOGBOOK_PARSER = "parser_html" # All years since 2010 use the default value for Logbook parser # but several don't work, and are skipped by the parsing code, e.g. 1983 LOGBOOK_PARSER_SETTINGS = { @@ -89,11 +94,11 @@ LOGBOOK_PARSER_SETTINGS = { "1982": ("log.htm", "parser_html_01"), } -entries = { "2022": 64, "2019": 56, "2018": 74, "2017": 60, "2016": 81, "2015": 79, +entries = { "2022": 64, "2019": 56, "2018": 75, "2017": 61, "2016": 81, "2015": 79, "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52, "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, - "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41, - "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1, + "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42, + "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1, "1985": 24, "1984": 32, "1983": 52, "1982": 42,} # Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing. @@ -258,7 +263,7 @@ def ParseDate(tripdate, year): return datetime.date(1970, 1, 1) # (2006 - not any more), 2008 - 2009 -def wiki_parser(year, expedition, txt): +def wiki_parser(year, expedition, txt, seq=""): global logentries global logdataissues @@ -300,10 +305,20 @@ def wiki_parser(year, expedition, txt): # 2002, 2004, 2005, 2007, 2010 - now # 2006 wiki text is incomplete, but the html all there. So using this parser now. -def parser_html(year, expedition, txt): +def parser_html(year, expedition, txt, seq=""): global logentries global logdataissues + # extract front material and stash for later use when rebuilding from list of entries + headmatch = re.match(r"(?i)(?s).*]*>(.*?)0): + frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html") + with open(frontpath,"w") as front: + front.write(headpara+"\n") + tripparas = re.findall(r"([\s\S]*?)(?=]*>(.*?)0): + frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html") + with open(frontpath,"w") as front: + front.write(headpara+"\n") + tripparas = re.findall(r"([\s\S]*?)(?=