Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r--  parsers/logbooks.py  |  76
1 file changed, 66 insertions(+), 10 deletions(-)
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 4310fdb..46aba96 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -21,6 +21,39 @@ Parses and imports logbooks in all their wonderful confusion
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc.)
'''
+todo='''
+- Put the object store 'trips' and the 'logdataissues' into TROG global object
+
+- refactor everything with some urgency, esp. LoadLogbookForExpedition()
+
+- delete all the autoLogbookEntry stuff when we are absolutely certain what it does
+
+- Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no fully-working parser,
+  or the parser is broken/incomplete and the files need hand-editing.
+
+- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
+  we keep only a modern HTML5 format. Then we can retire the old parsers and reduce the
+  volume of code here substantially.
+
+- edit LoadLogbooks() to use coroutines to speed up import substantially,
+ but perhaps we had better profile it first?
+
+- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
+
+- the object store will need additional functions to replicate the persontrip calculation
+ and storage. For the moment we leave all that to be done in the django db
+
+- We should ensure logbook.html is utf-8 and stop this crap (see the sketch just below this hunk):
+ file_in = open(logbookfile,'rb')
+ txt = file_in.read().decode("latin1")
+'''
+
+logentries = [] # the entire logbook for one year is a single object: a list of entries
+noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
+ 'base camp', 'basecamp', 'top camp', 'topcamp' ]
+logdataissues = {}
+trips = {}
+
#
# the logbook loading section
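A note on the utf-8 item in the todo list above: the fix is to open the file as text with an explicit encoding rather than decoding raw bytes by hand. This is only a minimal sketch, assuming a latin-1 fallback is still wanted for unconverted files; read_logbook_text is a hypothetical helper, not a function in this module.

    def read_logbook_text(logbookfile):
        # Try utf-8 first; fall back to latin-1 only for legacy files that
        # were never converted. (Sketch only, not the parser's current code.)
        try:
            with open(logbookfile, encoding="utf-8") as f:
                return f.read()
        except UnicodeDecodeError:
            with open(logbookfile, encoding="latin-1") as f:
                return f.read()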
@@ -77,12 +110,6 @@ def GetTripCave(place):
return None
-logentries = [] # the entire logbook for one year is a single object: a list of entries
-noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
- 'base camp', 'basecamp', 'top camp', 'topcamp' ]
-logdataissues = {}
-trips ={}
-
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"):
""" saves a logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why?
@@ -205,8 +232,10 @@ def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu,
#print(" - New id ",tid)
else:
tid= tripid1
+
if tid in trips:
- msg = " ! DUPLICATE id .{}. {} ~{}~".format(tid, trips[tid][0], trips[tid][1])
+ tyear, tdate, *trest = trips[tid]
+ msg = f" ! DUPLICATE on {tdate} id: '{tid}'"
print(msg)
DataIssue.objects.create(parser='logbooks', message=msg)
tid= "d{}-s{:02d}".format(str(date),seq)
@@ -427,6 +456,7 @@ def LoadLogbookForExpedition(expedition, expect):
"""
# absolutely horrid. REFACTOR THIS (all my fault..)
global logentries
+ global logdataissues
logbook_parseable = False
logbook_cached = False
yearlinks = settings.LOGBOOK_PARSER_SETTINGS
@@ -445,6 +475,26 @@ def LoadLogbookForExpedition(expedition, expect):
return False
return True
+    def cleanerrors(year):
+        global logdataissues
+        print(f' - CLEAN {year} {len(logdataissues)} data issues in total')
+        dataissues = DataIssue.objects.filter(parser='logbooks')
+        for di in dataissues:
+            ph = "t" + year + "-"
+            if re.search(ph, di.message) is not None:
+                print(f' - CLEANING dataissue {di.message}')
+                di.delete()
+
+        for te in list(logdataissues):  # iterate over a copy of the keys: entries are popped below
+            # tripentry = year + "." + str(logbook_entry_count)
+            print(f' - CLEAN {te}')
+            if te.startswith(year + "."):
+                print(f' - CLEANING logdataissue {te}')
+                logdataissues.pop(te)
+
+
+    cleanerrors(expedition.year)
+
if expedition.year in yearlinks:
logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0])
expedition.logbookfile = yearlinks[expedition.year][0]
@@ -478,10 +528,10 @@ def LoadLogbookForExpedition(expedition, expect):
print(" -- Loaded ", len(logentries), " log entries")
logbook_cached = True
else:
- print(" !- Should be ", expect, " but ", len(logentries), " found in cache")
+ print(" !- Told to expect ", expect, " but ", len(logentries), " found in cache")
raise
except:
- print(" ! Failed to load corrupt cache. Deleting it.")
+ print(" ! Failed to load corrupt cache. (Or I was told to ignore it). Deleting it.")
os.remove(cache_filename)
logentries=[]
raise
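The two messages changed above both belong to the cache check: a count mismatch raises out of the inner try, and the except clause deletes the stale pickle so the logbook is re-parsed from source. A stripped-down sketch of that pattern, keeping the names cache_filename, expect and the idea of a pickled entry list from the diff, with everything else assumed:

    import os
    import pickle

    def load_cached_entries(cache_filename, expect):
        # Return the cached list only if it holds the expected number of
        # entries; otherwise delete the cache and let the caller re-parse.
        try:
            with open(cache_filename, "rb") as f:
                entries = pickle.load(f)
            if len(entries) != expect:
                raise ValueError(f"told to expect {expect} but {len(entries)} found in cache")
            return entries
        except Exception:
            if os.path.exists(cache_filename):
                os.remove(cache_filename)
            return None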
@@ -554,7 +604,7 @@ def LoadLogbooks():
TROG['pagecache']['expedition'][expo.year] = None # clear cache
if expo.year not in nologbook:
print((" - Logbook for: " + expo.year))
- numentries = LoadLogbookForExpedition(expo, entries[expo.year])
+ numentries = LoadLogbookForExpedition(expo, entries[expo.year]) # this actually loads the logbook for one year
log.write("{} {:5d} should be {}\n".format(expo.year, numentries, entries[expo.year]))
nlbe[expo.year]=numentries
expd[expo.year]= 0
@@ -588,6 +638,12 @@ locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
def parseAutoLogBookEntry(filename):
+    '''An AutoLogBookEntry appears to be one that was created online using a form, for a single trip,
+    which is then stored in a separate location from the usual logbook.html.
+    But when importing logbook.html, all these individual entries also need to be parsed.
+
+    This is all redundant, as we are getting rid of the whole individual trip entry system.
+    '''
errors = []
f = open(filename, "r")
contents = f.read()
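For illustration, the span regexes defined just above this function pull single fields out of one of these per-trip HTML files along these lines (the sample HTML is invented):

    import re

    locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
    caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)

    sample = '<span class="cave">1623-161</span> <span class="location">Kaninchenhöhle</span>'
    print(caveRegex.search(sample).group(1))       # -> 1623-161
    print(locationRegex.search(sample).group(1))   # -> Kaninchenhöhle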