diff options
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- | parsers/logbooks.py | 34 |
1 files changed, 29 insertions, 5 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py index e37780c..d194a5e 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -51,7 +51,7 @@ MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200 BLOG_PARSER_SETTINGS = { # "2022": ("ukcavingblog.html", "parser_blog"), "2019": ("ukcavingblog.html", "parser_blog"), - "2018": ("ukcavingblog.html", "parser_blog"), +# "2018": ("ukcavingblog.html", "parser_blog"), # "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html } DEFAULT_LOGBOOK_FILE = "logbook.html" @@ -83,7 +83,7 @@ LOGBOOK_PARSER_SETTINGS = { "1982": ("log.htm", "parser_html_01"), } -entries = { "2022": 86, "2019": 56, "2018": 86, "2017": 76, "2016": 83, "2015": 79, +entries = { "2022": 86, "2019": 56, "2018": 100, "2017": 76, "2016": 83, "2015": 79, "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53, "2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42, @@ -138,6 +138,9 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): tripperson = "Nadia" if tripperson =="tcacrossley": tripperson = "Tom Crossley" + if tripperson =="Samouse1": + tripperson = "Todd Rye" + personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower()) @@ -497,6 +500,13 @@ def parser_blog(year, expedition, txt, sq=""): This uses some of the more obscure capabilities of regular expressions, see https://docs.python.org/3/library/re.html + + BLOG entries have this structure: + <article ... data-author="Tinywoman" data-content="post-298780" id="js-post-298780"> + <article class="message-body js-selectToQuote"> + </article> + </article> + So the content is nested inside the header. Attachments (images) come after the content. ''' global logentries global logdataissues @@ -508,19 +518,26 @@ def parser_blog(year, expedition, txt, sq=""): print(message) # (?= is a non-consuming match, see https://docs.python.org/3/library/re.html - tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt) + tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(</article[^>]*>)([\s\S]*?)(?=</article)", txt) if not ( tripparas ) : message = f" ! - Skipping on failure to parse article content: {txt[:500]}" print(message) if (len(tripheads) !=len(tripparas)): print(f"{len(tripheads)} != {len(tripparas)}") + print(f"{len(tripheads)} - {len(tripparas)}") location = "Plateau" # best guess, fix manually later tu = 0 logbook_entry_count = 0 for i in range(0, len(tripparas)): - tripcontent = tripparas[i] + tripstuff = tripparas[i] + attach = tripstuff[2] + # note use on non-greedy *? regex idiom here + attach = re.sub(r"<div class=\"file-content\">[\s\S]*?(?=</li>)","",attach) + attach = re.sub(r"<footer[\s\S]*(</footer>)","",attach) + tripcontent = tripstuff[0] + attach + #print(f"{i} - {len(tripstuff)} - {tripstuff[1]}") triphead = tripheads[i] logbook_entry_count += 1 tid = set_trip_id(year,logbook_entry_count) +"_blog" + sq @@ -684,8 +701,15 @@ def LoadLogbook(year): nlbe={} TROG['pagecache']['expedition'][year] = None # clear cache - expo = Expedition.objects.get(year=year) + expo = Expedition.objects.get(year=year) + year = expo.year # some type funny nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo + if year in BLOG_PARSER_SETTINGS: + print("BLOG parsing") + LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year] + nlbe[expo] = LoadLogbookForExpedition(expo, clean=False) # this loads the blog logbook for one expo + else: + print(f" {year} not in {BLOG_PARSER_SETTINGS}") def LoadLogbooks(): """ This is the master function for parsing all logbooks into the Troggle database. |