Fixing wiki-parsing for 2009 logbook

author: Philip Sargent <philip.sargent@gmail.com> 2022-12-18 19:33:56 +0000
committer: Philip Sargent <philip.sargent@gmail.com> 2022-12-18 19:33:56 +0000
commit: d1b94763b43842e7834062a2ab68978c6d95a95e (patch)
tree: 40c35912341c838fe65d8cd4f81d1207b4418e71 /parsers/logbooks.py
parent: 73b710d53f0ea4fb4c1693679732e19a53530d1d (diff)
download: troggle-d1b94763b43842e7834062a2ab68978c6d95a95e.tar.gz
troggle-d1b94763b43842e7834062a2ab68978c6d95a95e.tar.bz2
troggle-d1b94763b43842e7834062a2ab68978c6d95a95e.zip
1 file changed, 63 insertions, 39 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index cde11bd..d4db001 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -52,7 +52,7 @@ data for old logbooks. New design needed, with a mechanism for flagging fixtures
 '''
 MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
 BLOG_PARSER_SETTINGS = {
-                # "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
+#               "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
                 "2018": ("ukcavingblog.html", "parser_blog"), 
                 "2019": ("ukcavingblog.html", "parser_blog"), 
                 "2022": ("ukcavingblog.html", "parser_blog"), 
@@ -60,12 +60,13 @@ BLOG_PARSER_SETTINGS = {
 DEFAULT_LOGBOOK_FILE = "logbook.html"
 DEFAULT_LOGBOOK_PARSER = "parser_html"
 # All years since 2010 use the default value for Logbook parser
-# but several don't work, and are skipped by the parsing code, e.g. 1983
 LOGBOOK_PARSER_SETTINGS = {
                 "2019": ("logbook.html", "parser_html"), 
                 "2010": ("logbook.html", "parser_html"), 
-                "2009": ("2009logbook.txt", "wiki_parser"), 
-                "2008": ("2008logbook.txt", "wiki_parser"), 
+#               "2009": ("2009logbook.txt", "wiki_parser"), # converted to html
+#               "2008": ("2008logbook.txt", "wiki_parser"), # converted to html
+                "2009": ("logbook.html", "parser_html"), 
+                "2008": ("logbook.html", "parser_html"), 
                 "2007": ("logbook.html", "parser_html"), 
                 "2006": ("logbook.html", "parser_html"), 
 #               "2006": ("logbook/logbook_06.txt", "wiki_parser"), # converted to html
@@ -96,15 +97,15 @@ LOGBOOK_PARSER_SETTINGS = {
             }
 
 entries = { "2022": 64, "2019": 56, "2018": 75, "2017": 76, "2016": 81, "2015": 79, 
-    "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52, 
-    "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, 
+    "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53, 
+    "2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, 
     "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42, 
     "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
     "1985": 24, "1984": 32, "1983": 52, "1982": 42,}
 # Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
 
 logentries = [] # the entire logbook for one year is a single object: a list of entries
-noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau', 
+noncaveplaces = [ "Journey", "Loser Plateau", "UNKNOWN", 'plateau', 
         'base camp', 'basecamp', 'top camp', 'topcamp' ]
 logdataissues = TROG['issues']['logdataissues']
 trips ={}
@@ -170,11 +171,30 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
     """ saves a logbook entry and related persontrips 
     Does NOT save the expeditionday_id  - all NULLs. why? Because we are deprecating expeditionday !
     
-    troggle.log shows that we are creating lots of duplicates, which is no no problem with SQL as they just overwrite
-    but we are saving the  same thing too many times.. 
+    troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite, but we are saving the same thing too many times.
+    
+    Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
+    lookupAttribs={'date':date, 'title':title}
     """
+ 
+    # Nasty hack, must tidy this up..
+    if logtime_underground:
+        try:
+            logtime_underground = float(logtime_underground)
+        except:
+            # print(f"logtime_underground = {logtime_underground}")
+            tu_match = re.match(r"(T/U:\s*)?(\d+[.]?\d*).*", logtime_underground)
+            if tu_match:
+                # print(f"logtime_underground = {tu_match.group(2)}")
+                logtime_underground = float(tu_match.group(2))
+            else:
+                logtime_underground = 0
+    else:
+        logtime_underground = 0
+
     try:
         trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
+        # print(f" - {author} - {logtime_underground}")
     except:
         message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
         DataIssue.objects.create(parser='logbooks', message=message)
@@ -223,11 +243,13 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
         # slug = tid + "_" + slugify(title)[:10].replace('-','_')
     else: 
         slug = str(randint(1000,9999)) + "_" + slugify(title)[:10].replace('-','_')
-    nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug}
+    nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 
+        'time_underground':logtime_underground, 'cave_slug':str(cave), 'slug': slug}
     
     # This creates the lbo instance of LogbookEntry
     lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
     
+    # for PersonTrip time_underground is float (decimal hours)
     for tripperson, time_underground in trippersons:
         # print(f" -  {tid} '{tripperson}' author:{tripperson == author}")
         lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
@@ -300,24 +322,29 @@ def wiki_parser(year, expedition, txt, seq=""):
         else:
             tripsplace = tripsplace[1]
             
-        #print(f"! LOGBOOK {year} {logbook_entry_count:2}  {len(triptext):4}  '{tripsplace}'")
 
-        tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
+        #tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
+        tul = re.findall(r"T/U:?\s*(\d+[.]?\d*)\s*(hr|hrs|hours)?.*", triptext)
         if tul:
             tu = tul[0][0]
         else:
             tu = ""
+        print(f"! LOGBOOK {year} {logbook_entry_count:2}  {len(triptext):4} T/U:{tu}  '{tripcave} - {tripsplace}' ")
 
         ldate = ParseDate(tripdate.strip(), year)
-        tripid =""
-        
-        entrytuple = (ldate, tripcave, tripsplace, triptext, 
+        tripid = set_trip_id(year,logbook_entry_count)
+
+        ltriptext = re.sub(r"\n", "<br /><br />\n", triptext)
+        ltriptext = ltriptext.replace("<br /><br />\n<br /><br />\n","<br /><br />\n")
+
+        triptitle = f'{tripcave} - {tripsplace}'
+        entrytuple = (ldate, tripcave, triptitle, ltriptext, 
                 trippeople, expedition, tu, tripid)
         logentries.append(entrytuple)
         
+
         
-# 2002, 2004, 2005, 2007, 2010 - now
-# 2006 wiki text is incomplete, but the html all there. So using this parser now.
+# 2002, 2004 - now
 def parser_html(year, expedition, txt, seq=""):
     global logentries
     global logdataissues
@@ -382,7 +409,7 @@ def parser_html(year, expedition, txt, seq=""):
         else:
             tripcave = "UNKNOWN"
         ltriptext = re.sub(r"</p>", "", triptext)
-        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+        #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
         ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
 
         entrytuple = (ldate, tripcave, triptitle, ltriptext, 
@@ -665,15 +692,17 @@ def LoadLogbookForExpedition(expedition, clean=True):
             print(f' - {year} parsing with {parsefunc} - {lb}')
             parser(year, expedition, txt, sq) # this launches the right parser for this year
             # --------------------
-           
+        dupl = {}
         for entrytuple in logentries:
-            # date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
-            try:
-                date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
-            except ValueError: # cope with removal of entry_type but still in cache files. Remove in Dec. 2022.
-                date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
-                print(f'   - Exception entry_type "{entry_type}" {tripid1}')
-            EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
+            date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+            check = (date, triptitle)
+            if check in dupl:
+                dupl[check] += 1
+                triptitle = f"{triptitle} #{dupl[check]}"
+                print(f'  - {triptitle}')
+            else:
+                dupl[check] = 1
+            EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground,
                     tripid1)
     
     if len(logentries) == expect:
@@ -684,19 +713,16 @@ def LoadLogbookForExpedition(expedition, clean=True):
 
     return len(logentries)
 
-# def LoadLogbook(year, format="cucc"):
-    # global LOGBOOK_PARSER_SETTINGS
+def LoadLogbook(year):
+    '''One off logbook for testing purposes
+    '''
+    global LOGBOOK_PARSER_SETTINGS
      
-    # nlbe={}
-    # TROG['pagecache']['expedition'][year] = None # clear cache
-    
-    # expo = Expedition.objects.get(year=year)
-    
-    # if (format=="blog"):
-        # LOGBOOK_PARSER_SETTINGS[str(year)] = BLOG_PARSER_SETTINGS[str(year)] 
-    # # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
+    nlbe={}
+    TROG['pagecache']['expedition'][year] = None # clear cache
     
-    # nlbe[expo] = LoadLogbookForExpedition(expo)  # this actually loads the logbook for one expo
+    expo = Expedition.objects.get(year=year)    
+    nlbe[expo] = LoadLogbookForExpedition(expo)  # this actually loads the logbook for one expo
 
 def LoadLogbooks():
     """ This is the master function for parsing all logbooks into the Troggle database. 
@@ -721,8 +747,6 @@ def LoadLogbooks():
     sqlfail =     ["1987", "1988", "1989"] # breaks mysql with db constraint fail - debug locally first]
     nologbook = noexpo + lostlogbook + sqlfail
 
-    # blogs = ["2019"]
-
     nlbe={}
     expd ={}
     loglist = []
author	Philip Sargent <philip.sargent@gmail.com>	2022-12-18 19:33:56 +0000
committer	Philip Sargent <philip.sargent@gmail.com>	2022-12-18 19:33:56 +0000
commit	d1b94763b43842e7834062a2ab68978c6d95a95e (patch)
tree	40c35912341c838fe65d8cd4f81d1207b4418e71 /parsers/logbooks.py
parent	73b710d53f0ea4fb4c1693679732e19a53530d1d (diff)
download	troggle-d1b94763b43842e7834062a2ab68978c6d95a95e.tar.gz troggle-d1b94763b43842e7834062a2ab68978c6d95a95e.tar.bz2 troggle-d1b94763b43842e7834062a2ab68978c6d95a95e.zip