Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r--  parsers/logbooks.py  100
1 file changed, 35 insertions(+), 65 deletions(-)
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 3b01eed..838c253 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -10,7 +10,7 @@ from django.template.defaultfilters import slugify
from parsers.people import GetPersonExpeditionNameLookup
from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
from troggle.core.models.troggle import DataIssue, Expedition
-from troggle.core.utils import TROG, save_carefully
+from troggle.core.utils import save_carefully
"""
Parses and imports logbooks in all their wonderful confusion
@@ -18,12 +18,16 @@ Parses and imports logbooks in all their wonderful confusion
https://expo.survex.com/handbook/computing/logbooks-parsing.html
"""
todo = """
-- refactor everything with some urgency, esp. LoadLogbookForExpedition()
+- refactor everything with some urgency, esp. parse_logbook_for_expedition()
-- remove the TROG things since we need the database for multiuser access? Or not?
+- break out the code that hits the database from that which parses the logbook
+so that the file-reading and parsing can be parallelized, while writing to the
+database remains serialized (sqlite is single-user).
- profile the code to find bad repetitive things, of which there are many.
+- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted
+
- far too many uses of Django field dereferencing to get values, which is SLOW
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
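A minimal sketch of the generator rewrite suggested in the todo above, assuming the per-entry parsing were factored out of parser_html: yield each entrytuple as it is parsed instead of appending everything to the module-global logentries list, so a whole year's logbook never sits in memory at once. iter_logbook_entries and its placeholder body are hypothetical; only set_trip_id and the entrytuple shape come from this file.

    from datetime import date

    def iter_logbook_entries(tripparas, year, expedition, tu=0):
        # Hypothetical generator variant of parser_html: yields entrytuples
        # one at a time instead of building the global logentries list.
        for seq, trippara in enumerate(tripparas):
            tid = set_trip_id(year, seq)
            # placeholder values; the real version does the per-entry regex work
            yield (date(1970, 1, 1), "UNKNOWN", f"entry {tid}", trippara, "", expedition, tu, tid)

Callers could then stream entries straight into the database write loop instead of holding the full list.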
@@ -96,12 +100,8 @@ entries = {
logentries = [] # the entire logbook for one year is a single object: a list of entries
noncaveplaces = ["Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
-logdataissues = TROG["issues"]["logdataissues"]
trips = {}
-#
-# the logbook loading section
-#
def set_trip_id(year, seq):
tid = f"{year}_s{seq:02d}"
return tid
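For reference, the sequence-numbered trip ids this produces look like:

    >>> set_trip_id("2018", 7)
    '2018_s07'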
@@ -149,7 +149,6 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this expedition year."
print(message)
DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues[tid] = message
res.append((personyear, logtime_underground))
if mul:
author = personyear
@@ -163,7 +162,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
- """saves a logbook entry and related persontrips
+ """saves a single logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite, but we are saving the same thing too many times.
@@ -193,7 +192,6 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
except:
message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues["title"] = message
print(message)
raise
return
@@ -201,7 +199,6 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
if not author:
message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues["title"] = message
print(message)
# return
@@ -261,7 +258,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
def ParseDate(tripdate, year):
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
- dummydate = date(1970, 1, 1)
+ dummydate = date(1970, 1, 1) # replace with _EPOCH
month = 1
day = 1
# message = f" ! - Trying to parse date in logbook: {tripdate} - {year}"
@@ -273,7 +270,6 @@ def ParseDate(tripdate, year):
if not (mdatestandard.group(1) == year):
message = f" ! - Bad date (year) in logbook: {tripdate} - {year}"
DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues["tripdate"] = message
return dummydate
else:
year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
@@ -281,23 +277,20 @@ def ParseDate(tripdate, year):
if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]):
message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues["tripdate"] = message
return dummydate
else:
yadd = int(year[:2]) * 100
day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
else:
- year = 1970
+ year = 1970 # replace with _EPOCH
message = f" ! - Bad date in logbook: {tripdate} - {year}"
DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues["tripdate"] = message
return date(year, month, day)
except:
message = f" ! - Failed to parse date in logbook: {tripdate} - {year}"
DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues["tripdate"] = message
- return datetime.date(1970, 1, 1)
+ return date(1970, 1, 1) # replace with _EPOCH; datetime.date(1970, 1, 1) was a TypeError, as datetime here is the class, not the module
def parser_html(year, expedition, txt, seq=""):
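_EPOCH, named in the new comments, is not defined anywhere in this diff; presumably it is intended as a module-level sentinel along these lines (a sketch, only the name is taken from the comments):

    from datetime import date

    _EPOCH = date(1970, 1, 1)  # sentinel returned when a logbook date cannot be parsed

    # each "replace with _EPOCH" site above then becomes simply:
    #     return _EPOCH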
@@ -309,7 +302,6 @@ def parser_html(year, expedition, txt, seq=""):
from parser_html_01 format logfiles, believe me.
"""
global logentries
- global logdataissues
# extract front material and stash for later use when rebuilding from list of entries
headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
@@ -356,7 +348,6 @@ def parser_html(year, expedition, txt, seq=""):
msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:50]}'..."
print(msg)
DataIssue.objects.create(parser="logbooks", message=msg)
- logdataissues[tid] = msg
s2 = re.match(
r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
@@ -377,7 +368,6 @@ def parser_html(year, expedition, txt, seq=""):
msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:50]}'..."
print(msg)
DataIssue.objects.create(parser="logbooks", message=msg)
- logdataissues[tid] = msg
continue
ldate = ParseDate(tripdate.strip(), year)
@@ -413,7 +403,6 @@ def parser_blog(year, expedition, txt, sq=""):
So the content is nested inside the header. Attachments (images) come after the content.
"""
global logentries
- global logdataissues
tripheads = re.findall(
r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
@@ -455,7 +444,6 @@ def parser_blog(year, expedition, txt, sq=""):
if not (match_author):
message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues[tid] = message
print(message)
break
trippeople = match_author.group(1)
@@ -465,7 +453,6 @@ def parser_blog(year, expedition, txt, sq=""):
if not (match_datetime):
message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues[tid] = message
print(message)
break
datestamp = match_datetime.group(1)
@@ -475,7 +462,6 @@ def parser_blog(year, expedition, txt, sq=""):
except:
message = f" ! - FROMISOFORMAT fail logentry {year}:{logbook_entry_count} {tid} '{datestamp}'"
DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues[tid] = message
print(message)
# fallback, ignore the timestamp bits:
tripdate = datetime.fromisoformat(datestamp[0:10])
@@ -494,14 +480,30 @@ def parser_blog(year, expedition, txt, sq=""):
entrytuple = (tripdate, location, tripname, tripcontent, trippeople, expedition, tu, tid)
logentries.append(entrytuple)
+def clean_logbook_for_expedition(expedition):
+ def cleanerrors(year):
+ dataissues = DataIssue.objects.filter(parser="logbooks")
+ for di in dataissues:
+ ph = year
+ if re.search(ph, di.message) is not None: # SLOW just to delete issues for one year
+ # print(f' - CLEANING dataissue {di.message}')
+ di.delete()
+
+ year = expedition.year
+ cleanerrors(year)
+
+ lbes = LogbookEntry.objects.filter(expedition=expedition) # must be a quicker way
+ for lbe in lbes:
+ lbe.delete()
-def LoadLogbookForExpedition(expedition, clean=True):
+def parse_logbook_for_expedition(expedition):
"""Parses all logbook entries for one expedition
- if clean==True then it deletes all entries for this year first.
+ entries are no longer deleted here: call clean_logbook_for_expedition() first.
"""
global logentries
# absolutely horrid. REFACTOR THIS (all my fault..)
- global logdataissues
global entries
logbook_parseable = False
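Both loops in the new clean_logbook_for_expedition() above are flagged as slow; a possible rewrite pushes the work into the database as two bulk DELETEs (a sketch, assuming DataIssue.message is a plain text field, so message__contains can match the year embedded in each message):

    def clean_logbook_for_expedition(expedition):
        # Deletes the logbooks DataIssues and LogbookEntries for one expedition year.
        year = expedition.year
        # one DELETE query instead of re.search over every logbooks DataIssue
        DataIssue.objects.filter(parser="logbooks", message__contains=year).delete()
        # queryset.delete() removes the whole set without a Python-level loop
        LogbookEntry.objects.filter(expedition=expedition).delete()

One caveat: queryset delete() bypasses any overridden Model.delete() method, so this only holds if LogbookEntry does not define one.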
@@ -513,28 +515,6 @@ def LoadLogbookForExpedition(expedition, clean=True):
expect = entries[year]
# print(" - Logbook for: " + year)
- def cleanerrors(year):
- global logdataissues
- dataissues = DataIssue.objects.filter(parser="logbooks")
- for di in dataissues:
- ph = year
- if re.search(ph, di.message) is not None:
- # print(f' - CLEANING dataissue {di.message}')
- di.delete()
-
- # print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year')
- dellist = []
- for key, value in logdataissues.items():
- # print(f' - CLEANING logdataissues [{key}]: {value}')
- if key.startswith(year):
- # print(f' - CLEANING logdataissues [{key:12}]: {value} ')
- dellist.append(key)
- for i in dellist:
- del logdataissues[i]
-
- if clean:
- cleanerrors(year)
-
if year in yearlinks:
yearfile, yearparser = yearlinks[year]
logbookpath = Path(yearfile)
@@ -549,11 +529,6 @@ def LoadLogbookForExpedition(expedition, clean=True):
expedition.save()
- lbes = LogbookEntry.objects.filter(expedition=expedition)
- if clean:
- for lbe in lbes:
- lbe.delete()
-
for sq in ["", "2", "3", "4"]: # cope with blog saved as many separate files
lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
if not (lb.is_file()):
@@ -603,15 +578,15 @@ def LoadLogbook(year):
global LOGBOOK_PARSER_SETTINGS
nlbe = {}
- TROG["pagecache"]["expedition"][year] = None # clear cache
expo = Expedition.objects.get(year=year)
year = expo.year # normalise the type: Expedition.year is a string
- nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
+ clean_logbook_for_expedition(expo)
+ nlbe[expo] = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
if year in BLOG_PARSER_SETTINGS:
print("BLOG parsing")
LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year]
- nlbe[expo] = LoadLogbookForExpedition(expo, clean=False) # this loads the blog logbook for one expo
+ nlbe[expo] = parse_logbook_for_expedition(expo) # this loads the blog logbook for one expo
else:
print(
f"Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
@@ -623,16 +598,13 @@ def LoadLogbooks():
This should be rewritten to use coroutines to load all logbooks from disc in parallel,
but database writes must be serialised, as sqlite is single-user.
"""
- global logdataissues
global entries
- logdataissues = {}
DataIssue.objects.filter(parser="logbooks").delete()
expos = Expedition.objects.all()
if len(expos) <= 1:
message = " ! - No expeditions found. Load 'people' first"
DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues["sqlfail 0000"] = message
print(message)
return
@@ -651,12 +623,10 @@ def LoadLogbooks():
for expo in expos: # pointless as we explicitly know the years in this code.
year = expo.year
- TROG["pagecache"]["expedition"][year] = None # clear cache
if year in sqlfail:
print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
DataIssue.objects.create(parser="logbooks", message=message)
- logdataissues[f"sqlfail {year}"] = message
print(message)
if year not in nologbook:
@@ -669,7 +639,7 @@ def LoadLogbooks():
bloglist.append(expo)
for ex in loglist:
- nlbe[ex] = LoadLogbookForExpedition(ex) # this loads the logbook for one expo
+ nlbe[ex] = parse_logbook_for_expedition(ex) # this loads the logbook for one expo
for b in bloglist:
if str(b) in LOGBOOK_PARSER_SETTINGS:
@@ -678,12 +648,12 @@ def LoadLogbooks():
orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
print(f" - BLOG: {b}")
- nlbe[b] = LoadLogbookForExpedition(b, clean=False) # this loads the blog logbook for one expo
+ nlbe[b] = parse_logbook_for_expedition(b) # this loads the blog logbook for one expo; no clean= in the new signature
LOGBOOK_PARSER_SETTINGS[str(b)] = orig
# tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
# yt = 0
- # for r in map(LoadLogbookForExpedition, loglist):
+ # for r in map(parse_logbook_for_expedition, loglist):
# yt = r
yt = 0
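A minimal sketch of the parse/write split that would make the commented-out map() attempt above safe, assuming parsing is first separated from database writes as the todo proposes. parse_logbook_file is hypothetical; only the entrytuple shape and the EnterLogIntoDbase signature come from this file.

    from concurrent.futures import ThreadPoolExecutor

    def load_logbooks_parallel(expeditions):
        # file reading and regex parsing can run concurrently...
        with ThreadPoolExecutor(max_workers=4) as pool:
            parsed = list(pool.map(parse_logbook_file, expeditions))  # hypothetical pure parser
        # ...but sqlite is single-user, so all writes stay on this one thread
        for entrytuples in parsed:
            for date, place, title, text, trippeople, expedition, tu, tid in entrytuples:
                EnterLogIntoDbase(date, place, title, text, trippeople, expedition, tu, tid)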