summaryrefslogtreecommitdiffstats
path: root/parsers/logbooks.py
diff options
context:
space:
mode:
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r--parsers/logbooks.py273
1 files changed, 100 insertions, 173 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index d615930..6a156af 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,8 +1,8 @@
import csv
-import datetime
+from datetime import datetime, date, time
import os
import re
-import time
+#import time
import pickle
import shelve
@@ -26,8 +26,6 @@ todo='''
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
-- delete all the autoLogbooKEntry stuff when we are absolutely certain what it does
-
- Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser,
or it is broken/incomplete and need hand-editing.
@@ -46,26 +44,35 @@ todo='''
- We should ensure logbook.html is utf-8 and stop this crap:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
+
+- this is a slow and uncertain function: cave = getCaveByReference(caveRef)
'''
logentries = [] # the entire logbook for one year is a single object: a list of entries
noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
'base camp', 'basecamp', 'top camp', 'topcamp' ]
-logdataissues = {}
+logdataissues = TROG['issues']['logdataissues']
trips ={}
#
# the logbook loading section
#
-def GetTripPersons(trippeople, expedition, logtime_underground, tid="!"):
+def set_trip_id(year, seq):
+ tid= f"{year}.s{seq:02d}"
+ return tid
+
+
+def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
res = [ ]
author = None
round_bracket_regex = re.compile(r"[\(\[].*?[\)\]]")
- if tid =="!":
- tid = expedition.year + "." + tripperson
+ #print(f'# {tid}')
+
for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople):
tripperson = tripperson.strip()
+ if not tid:
+ tid = expedition.year + "." + tripperson + datetime.now().strftime("%S%f") # no good. Should be getting the tid
mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson)
if mul:
tripperson = mul.group(1).strip()
@@ -86,23 +93,24 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid="!"):
author = res[-1][0]
return res, author
-def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid="!"):
+def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None):
""" saves a logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why?
"""
try:
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
except:
- message = " ! - Skipping logentry: %s - GetTripPersons FAIL in year '%s'" % (title, expedition.year)
+ message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["title"]=message
+ print(message)
return
if not author:
- print(" ! - Skipping logentry: " + title + " - no author for entry")
- message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
+ message = f" ! - {expedition.year} Skipping logentry: {title} - - no author for entry in year "
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["title"]=message
+ print(message)
return
# This needs attention. The slug field is derived from 'title'
@@ -113,16 +121,16 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
cave=None
if lplace not in noncaveplaces:
cave = GetCaveLookup().get(lplace)
- # message = " ! - '" + lplace + "' place not in noncaveplaces."
- # print(message)
- # DataIssue.objects.create(parser='logbooks', message=message)
#Check for an existing copy of the current entry, and save
expeditionday = expedition.get_expedition_day(date)
lookupAttribs={'date':date, 'title':title}
# 'cave' is converted to a string doing this, which renders as the cave slug.
# but it is a db query which we should try to avoid - rewrite this
- nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug':slugify(title)[:50], 'entry_type':entry_type}
+
+    #NEW slug for a logbook entry here! Use the unique id, not the title !!!
+ slug = tid + slugify(title)[:50]
+ nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug, 'entry_type':entry_type}
lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
@@ -133,31 +141,37 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
def ParseDate(tripdate, year):
""" Interprets dates in the expo logbooks and returns a correct datetime.date object """
- mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
- mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
- if mdatestandard:
- if not (mdatestandard.group(1) == year):
- message = " ! - Bad date (year) in logbook: " + tripdate + " - " + year
- DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues["tripdate"]=message
- return datetime.date('1970', '01', '01')
+ try:
+ mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
+ mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
+ if mdatestandard:
+ if not (mdatestandard.group(1) == year):
+ message = " ! - Bad date (year) in logbook: " + tripdate + " - " + year
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues["tripdate"]=message
+ return datetime.date('1970', '01', '01')
+ else:
+ year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
+ elif mdategoof:
+ if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]):
+ message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues["tripdate"]=message
+ return date('1970', '01', '01')
+ else:
+ yadd = int(year[:2]) * 100
+ day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
else:
- year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
- elif mdategoof:
- if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]):
- message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
+ message = " ! - Bad date in logbook: " + tripdate + " - " + year
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["tripdate"]=message
- return datetime.date('1970', '01', '01')
- else:
- yadd = int(year[:2]) * 100
- day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
- else:
- message = " ! - Bad date in logbook: " + tripdate + " - " + year
+
+ return date(year, month, day)
+ except:
+ message = " ! - Failed to parse date in logbook: " + tripdate + " - " + year
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["tripdate"]=message
-
- return datetime.date(year, month, day)
+ return date(year, month, day)
# (2006 - not any more), 2008 - 2009
def Parselogwikitxt(year, expedition, txt):
@@ -168,6 +182,8 @@ def Parselogwikitxt(year, expedition, txt):
trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
for triphead, triptext in trippara:
logbook_entry_count += 1
+ tid = set_trip_id(year,logbook_entry_count)
+
tripheadp = triphead.split("|")
if not (len(tripheadp) == 3):
message = " ! - Bad no of items in tripdate in logbook: " + tripdate + " - " + tripheadp
@@ -198,30 +214,24 @@ def Parselogwikitxt(year, expedition, txt):
logentries.append(entrytuple)
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople,
- expedition=expedition, logtime_underground=0)
+ expedition=expedition, logtime_underground=0, tid=tid)
EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople,
- tu, "wiki", tripid, logbook_entry_count)
+ tu, "wiki", tripid, logbook_entry_count, tid=tid)
-def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq):
+def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq, tid=None):
# This will need additional functions to replicate the persontrip calculation and storage. For the
# moment we leave all that to be done in the django db
global trips # should be a singleton TROG eventually
global logdataissues
- if tripid1 is None or tripid1 =="":
- tid= "n{}-s{:02d}".format(str(date),seq)
- #print(" - New id ",tid)
- else:
- tid= tripid1
-
if tid in trips:
tyear, tdate, *trest = trips[tid]
msg = f" ! DUPLICATE on {tdate} id: '{tid}'"
print(msg)
DataIssue.objects.create(parser='logbooks', message=msg)
- tid= "d{}-s{:02d}".format(str(date),seq)
+ tid = set_trip_id(str(date),seq)
#print(" - De-dup ",seq, tid)
logdataissues[tid]=msg
trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu, formattype)
@@ -247,7 +257,7 @@ def Parseloghtmltxt(year, expedition, txt):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
- tid= "n{}-s{:02d}".format(year,logbook_entry_count)
+ tid = set_trip_id(year,logbook_entry_count)
s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
\s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
@@ -285,7 +295,7 @@ def Parseloghtmltxt(year, expedition, txt):
entry_type="html", tid=tid)
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
- "html", tripid1, logbook_entry_count)
+ "html", tripid1, logbook_entry_count, tid=tid)
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand..
@@ -298,18 +308,26 @@ def Parseloghtml01(year, expedition, txt):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
- tid= f"{year}.s{logbook_entry_count:02d}"
+ tid = set_trip_id(year,logbook_entry_count)
try:
s = re.match(r"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
if not s:
- message = " ! - Skipping logentry on failure to parse header: " + tid + trippara[:300] + "..."
+ message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
break
tripheader, triptext = s.group(1), s.group(2)
mtripid = re.search(r'<a id="(.*?)"', tripheader)
+ # if not mtripid:
+    # # not an error, this is probably just a different year
+ # message = f" ! - Fail id trip:{tid} header:'{tripheader}'"
+ # DataIssue.objects.create(parser='logbooks', message=message)
+ # logdataissues[tid]=message
+ # print(message)
+
tripid = mtripid and mtripid.group(1) or ""
+ #print(f" # - mtripid: {mtripid}")
tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
tripdate, triptitle, trippeople = tripheader.split("|")
@@ -336,6 +354,13 @@ def Parseloghtml01(year, expedition, txt):
ltriptext = re.sub(r"</?u>", "_", ltriptext)
ltriptext = re.sub(r"</?i>", "''", ltriptext)
ltriptext = re.sub(r"</?b>", "'''", ltriptext)
+
+ if ltriptext == "":
+ message = " ! - Zero content for logbook entry!: " + tid
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, "html01", tripid)
@@ -343,16 +368,16 @@ def Parseloghtml01(year, expedition, txt):
try:
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0,
- entry_type="html")
-
+ entry_type="html", tid=tid)
except:
message = " ! - Enter log entry into database FAIL exception in: " + tid
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
+
try:
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
- "html01", tripid, logbook_entry_count)
+ "html01", tripid, logbook_entry_count, tid=tid)
except:
message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid
DataIssue.objects.create(parser='logbooks', message=message)
@@ -360,7 +385,7 @@ def Parseloghtml01(year, expedition, txt):
print(message)
except:
- message = " ! - Skipping logentry due to exception in: " + tid
+ message = f" ! - Skipping logentry {year} due to exception in: {tid}"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
@@ -381,11 +406,11 @@ def Parseloghtml03(year, expedition, txt):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
- tid= f"{year}.s{logbook_entry_count:02d}"
+ tid = set_trip_id(year,logbook_entry_count)
s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
if not ( s ) :
- message = " ! - Skipping logentry on failure to parse Parseloghtml03: {} {} {}...".format(tid,s,trippara[:300])
+ message = " ! - Skipping logentry {year} on failure to parse Parseloghtml03: {} {} {}...".format(tid,s,trippara[:300])
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
@@ -419,10 +444,10 @@ def Parseloghtml03(year, expedition, txt):
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
text = ltriptext, trippeople=trippeople, expedition=expedition,
- logtime_underground=0, entry_type="html")
+ logtime_underground=0, entry_type="html", tid=tid)
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
- "html03", tid, logbook_entry_count)
+ "html03", tid, logbook_entry_count, tid=tid)
def SetDatesFromLogbookEntries(expedition):
@@ -477,16 +502,15 @@ def LoadLogbookForExpedition(expedition, expect):
for di in dataissues:
ph = year
if re.search(ph, di.message) is not None:
- print(f' - CLEANING dataissue {di.message}')
+ #print(f' - CLEANING dataissue {di.message}')
di.delete()
- print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year')
+ #print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year')
dellist = []
for key, value in logdataissues.items():
- # tripentry = year + "." + str(logbook_entry_count)
- print(f' - CLEAN [{key}]')
- if key.startswith(year + "."):
- print(f' - CLEANING logdataissues [{key:12}]: value ')
+ #print(f' - CLEANING logdataissues [{key}]: {value}')
+ if key.startswith(year):
+ #print(f' - CLEANING logdataissues [{key:12}]: {value} ')
dellist.append(key)
for i in dellist:
del logdataissues[i]
@@ -547,7 +571,9 @@ def LoadLogbookForExpedition(expedition, expect):
if logbook_parseable:
parser = globals()[parsefunc]
- parser(expedition.year, expedition, txt)
+
+ parser(expedition.year, expedition, txt) # this launches the parser
+
SetDatesFromLogbookEntries(expedition)
if len(logentries) >0:
print(" - Cacheing " , len(logentries), " log entries")
@@ -625,112 +651,13 @@ def LoadLogbooks():
odb.sync()
odb.close()
-dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
-expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
-titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
-reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
-personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
-nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
-TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
-locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
-caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
-
-def parseAutoLogBookEntry(filename):
- '''An AutoLogBookEntry appears to be one that was created online using a form, for a single trip,
- which is then stored in a separate location to the usual logbook.html
- But when importing logbook.html all these individual entries also need ot be parsed.
-
- This is all redundant as we are getting rid of the whole individual trip entry system
- '''
- errors = []
- f = open(filename, "r")
- contents = f.read()
- f.close()
-
- dateMatch = dateRegex.search(contents)
- if dateMatch:
- year, month, day = [int(x) for x in dateMatch.groups()]
- date = datetime.date(year, month, day)
- else:
- errors.append(" - Date could not be found")
-
- expeditionYearMatch = expeditionYearRegex.search(contents)
- if expeditionYearMatch:
- try:
- expedition = Expedition.objects.get(year = expeditionYearMatch.groups()[0])
- personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
- except Expedition.DoesNotExist:
- errors.append(" - Expedition not in database")
- else:
- errors.append(" - Expedition Year could not be parsed")
-
- titleMatch = titleRegex.search(contents)
- if titleMatch:
- title, = titleMatch.groups()
- if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
- errors.append(" - Title too long")
- else:
- errors.append(" - Title could not be found")
-
- caveMatch = caveRegex.search(contents)
- if caveMatch:
- caveRef, = caveMatch.groups()
- try:
- # this is a slow and uncertain function:
- cave = getCaveByReference(caveRef)
- except:
- cave = None
- errors.append(" - Cave not found in database")
- else:
- cave = None
-
- locationMatch = locationRegex.search(contents)
- if locationMatch:
- location, = locationMatch.groups()
- else:
- location = None
-
- if cave is None and location is None:
- errors.append(" - Location nor cave could not be found")
+# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
+# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
+# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
+# reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
+# personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
+# nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
+# TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
+# locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
+# caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
- reportMatch = reportRegex.search(contents)
- if reportMatch:
- report, = reportMatch.groups()
- else:
- errors.append(" - Contents could not be found")
- if errors:
- return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.
- people = []
- for personMatch in personRegex.findall(contents):
- nameAuthorMatch = nameAuthorRegex.search(contents)
- if nameAuthorMatch:
- author, name = nameAuthorMatch.groups()
- if name.lower() in personExpeditionNameLookup:
- personExpo = personExpeditionNameLookup[name.lower()]
- else:
- errors.append(" - Person could not be found in database")
- author = bool(author)
- else:
- errors.append(" - Persons name could not be found")
-
- TUMatch = TURegex.search(contents)
- if TUMatch:
- TU, = TUMatch.groups()
- else:
- errors.append(" - TU could not be found")
- if not errors:
- people.append((name, author, TU))
- if errors:
- return errors # Bail out before committing to the database
- logbookEntry = LogbookEntry(date = date,
- expedition = expedition,
- title = title, cave = cave, place = location,
- text = report, slug = slugify(title)[:50],
- filename = filename)
- logbookEntry.save()
- for name, author, TU in people:
- PersonTrip(personexpedition = personExpo,
- time_underground = TU,
- logbook_entry = logbookEntry,
- is_logbook_entry_author = author).save()
- print(logbookEntry) \ No newline at end of file