summary refs log tree commit diff stats
path: root/parsers/logbooks.py
diff options
context:
space:
mode:
author Philip Sargent <philip.sargent@gmail.com> 2022-12-16 19:57:56 +0000
committer Philip Sargent <philip.sargent@gmail.com> 2022-12-16 19:57:56 +0000
commit f80e4efed8b6a329a7c92b9e0c68bb12faa9b517 (patch)
tree 76503a47a8926f2c026887c3c149d65cda778870 /parsers/logbooks.py
parent 5e9fd7fd77f2e94f433e9fa530b3c3e098d3dfa9 (diff)
downloadtroggle-f80e4efed8b6a329a7c92b9e0c68bb12faa9b517.tar.gz
troggle-f80e4efed8b6a329a7c92b9e0c68bb12faa9b517.tar.bz2
troggle-f80e4efed8b6a329a7c92b9e0c68bb12faa9b517.zip
parse several UK caving blogs per year - working
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- parsers/logbooks.py | 173
1 file changed, 106 insertions(+), 67 deletions(-)
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index ccd935f..25da271 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -51,8 +51,13 @@ data for old logbooks. New design needed, with a mechanism for flagging fixtures
'''
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
-DEFAULT_LOGBOOK_PARSER = "parser_html"
+BLOG_PARSER_SETTINGS = {
+ "2017": ("ukcavingblog.html", "parser_blog"),
+ "2019": ("ukcavingblog.html", "parser_blog"),
+ "2022": ("ukcavingblog.html", "parser_blog"),
+ }
DEFAULT_LOGBOOK_FILE = "logbook.html"
+DEFAULT_LOGBOOK_PARSER = "parser_html"
# All years since 2010 use the default value for Logbook parser
# but several don't work, and are skipped by the parsing code, e.g. 1983
LOGBOOK_PARSER_SETTINGS = {
@@ -89,11 +94,11 @@ LOGBOOK_PARSER_SETTINGS = {
"1982": ("log.htm", "parser_html_01"),
}
-entries = { "2022": 64, "2019": 56, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
+entries = { "2022": 64, "2019": 56, "2018": 75, "2017": 61, "2016": 81, "2015": 79,
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
- "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
- "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
+ "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
+ "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
"1985": 24, "1984": 32, "1983": 52, "1982": 42,}
# Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
@@ -258,7 +263,7 @@ def ParseDate(tripdate, year):
return datetime.date(1970, 1, 1)
# (2006 - not any more), 2008 - 2009
-def wiki_parser(year, expedition, txt):
+def wiki_parser(year, expedition, txt, seq=""):
global logentries
global logdataissues
@@ -300,10 +305,20 @@ def wiki_parser(year, expedition, txt):
# 2002, 2004, 2005, 2007, 2010 - now
# 2006 wiki text is incomplete, but the html all there. So using this parser now.
-def parser_html(year, expedition, txt):
+def parser_html(year, expedition, txt, seq=""):
global logentries
global logdataissues
+ # extract front material and stash for later use when rebuilding from list of entries
+ headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
+ headpara = headmatch.groups()[0].strip()
+
+ # print(f" - headpara:\n'{headpara}'")
+ if(len(headpara)>0):
+ frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
+ with open(frontpath,"w") as front:
+ front.write(headpara+"\n")
+
tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
logbook_entry_count = 0
for trippara in tripparas:
@@ -323,7 +338,7 @@ def parser_html(year, expedition, txt):
if s:
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
else: # allow title and people to be swapped in order
- msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:40]}'..."
+ msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:50]}'..."
print(msg)
DataIssue.objects.create(parser='logbooks', message=msg)
logdataissues[tid]=msg
@@ -340,11 +355,11 @@ def parser_html(year, expedition, txt):
if s2:
tripid, tripid1, tripdate, triptitle, trippeople, triptext, tu = s2.groups()
else:
- if not re.search(r"Rigging Guide", trippara):
- msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:40]}'..."
- print(msg)
- DataIssue.objects.create(parser='logbooks', message=msg)
- logdataissues[tid]=msg
+ # if not re.search(r"Rigging Guide", trippara):
+ msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:50]}'..."
+ print(msg)
+ DataIssue.objects.create(parser='logbooks', message=msg)
+ logdataissues[tid]=msg
continue
ldate = ParseDate(tripdate.strip(), year)
@@ -364,11 +379,21 @@ def parser_html(year, expedition, txt):
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
-def parser_html_01(year, expedition, txt):
+def parser_html_01(year, expedition, txt, seq=""):
global logentries
global logdataissues
errorcount = 0
-
+
+ # extract front material and stash for later use when rebuilding from list of entries
+ headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
+ headpara = headmatch.groups()[0].strip()
+
+ # print(f" - headpara:\n'{headpara}'")
+ if(len(headpara)>0):
+ frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
+ with open(frontpath,"w") as front:
+ front.write(headpara+"\n")
+
tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
logbook_entry_count = 0
for trippara in tripparas:
@@ -472,8 +497,8 @@ def parser_html_01(year, expedition, txt):
print(message)
return
-def parser_blog(year, expedition, txt):
- '''Parses the format of web pages collected as 'Save As HTML" fromt eh UK Caving blog website.
+def parser_blog(year, expedition, txt, sq=""):
+ '''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
Note that the entries have dates and authors, but no titles.
'''
global logentries
@@ -494,14 +519,13 @@ def parser_blog(year, expedition, txt):
print(f"{len(tripheads)} != {len(tripparas)}")
location = "Plateau"
- tripname = "UK Caving Blog post"
tu = 0
logbook_entry_count = 0
for i in range(0, len(tripparas)):
trippara = tripparas[i]
triphead = tripheads[i]
logbook_entry_count += 1
- tid = set_trip_id(year,logbook_entry_count) +"_blog"
+ tid = set_trip_id(year,logbook_entry_count) +"_blog" + sq
# print(f" - tid: {tid}")
# data-author="tcacrossley"
@@ -514,7 +538,7 @@ def parser_blog(year, expedition, txt):
break
trippeople = match_author.group(1)
# print(f" - tid: {tid} {trippeople}")
- # datetime="2019-07-11T13:16:18+0100"
+ # datetime="2019-07-11T13:16:18+0100"
match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
if not ( match_datetime ) :
message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
@@ -527,19 +551,25 @@ def parser_blog(year, expedition, txt):
try:
tripdate = datetime.fromisoformat(datestamp)
except:
- print(datestamp[0:9])
+ message = f" ! - FROMISOFORMAT fail logentry {year}:{logbook_entry_count} {tid} '{datestamp}'"
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+ # fallback, ignore the timestamp bits:
tripdate = datetime.fromisoformat(datestamp[0:10])
- # print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
+ print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
- tripname = f"UK Caving Blog post {logbook_entry_count}" # must be unique for a given date
+ tripname = f"UK Caving Blog{sq} post {logbook_entry_count}" # must be unique for a given date
+ tripcontent = trippara + f"\n\nBlog Author: {trippeople}"
- entrytuple = (tripdate, location, tripname, trippara,
+ entrytuple = (tripdate, location, tripname, tripcontent,
trippeople, expedition, tu, tid)
logentries.append(entrytuple)
def LoadLogbookForExpedition(expedition, clean=True):
""" Parses all logbook entries for one expedition
+ if clean==True then it deletes all entries for this year first.
"""
global logentries
# absolutely horrid. REFACTOR THIS (all my fault..)
@@ -580,13 +610,13 @@ def LoadLogbookForExpedition(expedition, clean=True):
if year in yearlinks:
yearfile, yearparser = yearlinks[year]
- logbookpath = Path(expologbase) / year / yearfile
+ logbookpath = Path(yearfile)
expedition.logbookfile = yearfile
parsefunc = yearparser
# print(f" - Logbook file {yearfile} using parser {yearparser}")
else:
- logbookpath = Path(expologbase) / year / DEFAULT_LOGBOOK_FILE
+ logbookpath = Path(DEFAULT_LOGBOOK_FILE)
expedition.logbookfile = DEFAULT_LOGBOOK_FILE
parsefunc = DEFAULT_LOGBOOK_PARSER
@@ -597,34 +627,39 @@ def LoadLogbookForExpedition(expedition, clean=True):
for lbe in lbes:
lbe.delete()
- try:
- file_in = open(logbookpath,'rb')
- txt = file_in.read().decode("utf-8")
- file_in.close()
- logbook_parseable = True
- except (IOError):
- logbook_parseable = False
- print(" ! Couldn't open logbook as UTF-8 " + logbookpath)
- except:
- logbook_parseable = False
- print(" ! Very Bad Error opening " + logbookpath)
-
- if logbook_parseable:
- parser = globals()[parsefunc]
- print(f' - {year} parsing with {parsefunc}')
- parser(year, expedition, txt) # this launches the right parser for this year
-
- i=0
- for entrytuple in logentries:
- # date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+ for sq in ["", "2", "3", "4"]: # cope with blog saved as many separate files
+ lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
+ if not (lb.is_file()):
+ # print(f" ! End of blog. Next blog file in sequence not there:{lb}")
+ break
try:
- date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
- except ValueError: # cope with removal of entry_type but still in cache files. Remove in Dec. 2022.
- date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
- print(f' - Exception entry_type "{entry_type}" {tripid1}')
- EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
- tripid1)
- i +=1
+ with open(lb,'rb') as file_in:
+ txt = file_in.read().decode("utf-8")
+ logbook_parseable = True
+ except (IOError):
+ logbook_parseable = False
+ print(f" ! Couldn't open logbook as UTF-8 {lb}")
+ except:
+ logbook_parseable = False
+ print(f" ! Very Bad Error opening {lb}")
+
+ if logbook_parseable:
+
+ # --------------------
+ parser = globals()[parsefunc]
+ print(f' - {year} parsing with {parsefunc} - {lb}')
+ parser(year, expedition, txt, sq) # this launches the right parser for this year
+ # --------------------
+
+ for entrytuple in logentries:
+ # date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+ try:
+ date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+ except ValueError: # cope with removal of entry_type but still in cache files. Remove in Dec. 2022.
+ date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
+ print(f' - Exception entry_type "{entry_type}" {tripid1}')
+ EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
+ tripid1)
if len(logentries) == expect:
# print(f"OK {year} {len(logentries):5d} is {expect}\n")
@@ -634,19 +669,19 @@ def LoadLogbookForExpedition(expedition, clean=True):
return len(logentries)
-def LoadLogbook(year, format="cucc"):
- global LOGBOOK_PARSER_SETTINGS
+# def LoadLogbook(year, format="cucc"):
+ # global LOGBOOK_PARSER_SETTINGS
- nlbe={}
- TROG['pagecache']['expedition'][year] = None # clear cache
+ # nlbe={}
+ # TROG['pagecache']['expedition'][year] = None # clear cache
- expo = Expedition.objects.get(year=year)
+ # expo = Expedition.objects.get(year=year)
- if (format=="blog"):
- LOGBOOK_PARSER_SETTINGS[str(year)] = ("ukcavingblog.html", "parser_blog")
- # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
+ # if (format=="blog"):
+ # LOGBOOK_PARSER_SETTINGS[str(year)] = BLOG_PARSER_SETTINGS[str(year)]
+ # # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
- nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
+ # nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database.
@@ -671,7 +706,7 @@ def LoadLogbooks():
sqlfail = ["1987", "1988", "1989"] # breaks mysql with db constraint fail - debug locally first]
nologbook = noexpo + lostlogbook + sqlfail
- blogs = ["2019"]
+ # blogs = ["2019"]
nlbe={}
expd ={}
@@ -694,17 +729,21 @@ def LoadLogbooks():
else:
print(" - No Logbook yet for: " + year) # catch case when preparing for next expo
- if year in blogs:
+ if year in BLOG_PARSER_SETTINGS:
bloglist.append(expo)
for ex in loglist:
- nlbe[ex] = LoadLogbookForExpedition(ex) # this actually loads the logbook for one expo
+ nlbe[ex] = LoadLogbookForExpedition(ex) # this loads the logbook for one expo
for b in bloglist:
- orig = LOGBOOK_PARSER_SETTINGS[str(b)]
- LOGBOOK_PARSER_SETTINGS[str(b)] = ("ukcavingblog.html", "parser_blog")
- nlbe[b] = LoadLogbookForExpedition(b, clean=False) # this actually loads the logbook for one expo
+ if str(b) in LOGBOOK_PARSER_SETTINGS:
+ orig = LOGBOOK_PARSER_SETTINGS[str(b)]
+ else:
+ orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
+ LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
+ print(f" - BLOG: {b}")
+ nlbe[b] = LoadLogbookForExpedition(b, clean=False) # this loads the blog logbook for one expo
LOGBOOK_PARSER_SETTINGS[str(b)] = orig
# tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock