Make the logbook parser a little more sane

Move the parser-to-expo mapping to settings. Set a default parser. Iterate over the expo years rather than the mapping list!
author: Sam Wenham <sam@wenhams.co.uk> 2019-03-06 23:20:34 +0000
committer: Sam Wenham <sam@wenhams.co.uk> 2019-03-06 23:20:34 +0000
commit: 9fc80bed35a03295fad492601f802cab830144ae (patch)
tree: 0be95a5e56fe69e403c934151d90a301c92ddf0e /parsers
parent: 59f8647e0faaa37cc250db7dd2670fd7e0c4db5b (diff)
download: troggle-9fc80bed35a03295fad492601f802cab830144ae.tar.gz
troggle-9fc80bed35a03295fad492601f802cab830144ae.tar.bz2
troggle-9fc80bed35a03295fad492601f802cab830144ae.zip
2 files changed, 120 insertions, 97 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index cb40f58..4554b08 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -26,16 +26,16 @@ from utils import save_carefully
 def GetTripPersons(trippeople, expedition, logtime_underground):    
     res = [ ]
     author = None
-    for tripperson in re.split(",|\+|&amp;|&(?!\w+;)| and ", trippeople):
+    for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
         tripperson = tripperson.strip()
-        mul = re.match("<u>(.*?)</u>$(?i)", tripperson)
+        mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson)
         if mul:
             tripperson = mul.group(1).strip()
         if tripperson and tripperson[0] != '*':
             #assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
             personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
             if not personyear:
-                print "NoMatchFor: '%s'" % tripperson    
+                print("   - No name match for: '%s'" % tripperson)
             res.append((personyear, logtime_underground))
             if mul:
                 author = personyear
@@ -65,11 +65,11 @@ def GetTripCave(place):                     #need to be fuzzier about matching h
         return tripCaveRes
 
     elif len(tripCaveRes)>1:
-        print "Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes)
+        print("Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes))
         correctIndex=input("type list index of correct cave")
         return tripCaveRes[correctIndex]
     else:
-        print "No cave found for place " , place
+        print("No cave found for place " , place)
         return
 
 
@@ -78,7 +78,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
     """ saves a logbook entry and related persontrips """
     trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
     if not author:
-        print "skipping logentry", title
+        print("   - skipping logentry" + title + " no author for entry")
         return
     
 #    tripCave = GetTripCave(place)
@@ -102,8 +102,8 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
 
 def ParseDate(tripdate, year):
     """ Interprets dates in the expo logbooks and returns a correct datetime.date object  """
-    mdatestandard = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
-    mdategoof = re.match("(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
+    mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
+    mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
     if mdatestandard:
         assert mdatestandard.group(1) == year, (tripdate, year)
         year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
@@ -117,7 +117,7 @@ def ParseDate(tripdate, year):
 
 # 2007, 2008, 2006
 def Parselogwikitxt(year, expedition, txt):
-    trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt)
+    trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
     for triphead, triptext in trippara:
         tripheadp = triphead.split("|")
         #print "ttt", tripheadp
@@ -126,7 +126,7 @@ def Parselogwikitxt(year, expedition, txt):
         tripsplace = tripplace.split(" - ")
         tripcave = tripsplace[0].strip()
 
-        tul = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
+        tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
         if tul:
             #assert len(tul) <= 1, (triphead, triptext)
             #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
@@ -142,10 +142,14 @@ def Parselogwikitxt(year, expedition, txt):
 
 # 2002, 2004, 2005
 def Parseloghtmltxt(year, expedition, txt):
-    tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
+    print(" - Using log html parser")
+    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
+    logbook_entry_count = 0
     for trippara in tripparas:
+        #print(" - HR detected - maybe a trip?")
+        logbook_entry_count += 1
         
-        s = re.match('''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
+        s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
                             \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
                             \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
                             \s*<div\s+class="trippeople">\s*(.*?)</div>
@@ -155,38 +159,40 @@ def Parseloghtmltxt(year, expedition, txt):
                             \s*$
                      ''', trippara)
         if not s:
-            if not re.search("Rigging Guide", trippara):
-                print "can't parse: ", trippara  # this is 2007 which needs editing
+            if not re.search(r"Rigging Guide", trippara):
+                print("can't parse: ", trippara)  # this is 2007 which needs editing
             #assert s, trippara
             continue
 
         tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
         ldate = ParseDate(tripdate.strip(), year)
         #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
-        trippeople = re.sub("Ol(?!l)", "Olly", trippeople)        
-        trippeople = re.sub("Wook(?!e)", "Wookey", trippeople)        
+        trippeople = re.sub(r"Ol(?!l)", "Olly", trippeople)        
+        trippeople = re.sub(r"Wook(?!e)", "Wookey", trippeople)        
         triptitles = triptitle.split(" - ")
         if len(triptitles) >= 2:
             tripcave = triptitles[0]
         else:
             tripcave = "UNKNOWN"
         #print "\n", tripcave, "---   ppp", trippeople, len(triptext)
-        ltriptext = re.sub("</p>", "", triptext)
-        ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
-        ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
+        ltriptext = re.sub(r"</p>", "", triptext)
+        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+        ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
         EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
+    if logbook_entry_count == 0:
+        print(" - No trip entrys found in logbook, check the syntax matches htmltxt format")
 
 
 # main parser for pre-2001.  simpler because the data has been hacked so much to fit it
 def Parseloghtml01(year, expedition, txt):
-    tripparas = re.findall("<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
+    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
     for trippara in tripparas:
         s = re.match(u"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
         assert s, trippara[:300]
         tripheader, triptext = s.group(1), s.group(2)
-        mtripid = re.search('<a id="(.*?)"', tripheader)
+        mtripid = re.search(r'<a id="(.*?)"', tripheader)
         tripid = mtripid and mtripid.group(1) or ""
-        tripheader = re.sub("</?(?:[ab]|span)[^>]*>", "", tripheader)
+        tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
 
         #print "   ", [tripheader]
         #continue
@@ -194,7 +200,7 @@ def Parseloghtml01(year, expedition, txt):
         tripdate, triptitle, trippeople = tripheader.split("|")
         ldate = ParseDate(tripdate.strip(), year)
     
-        mtu = re.search('<p[^>]*>(T/?U.*)', triptext)
+        mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
         if mtu:
             tu = mtu.group(1)
             triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
@@ -206,17 +212,17 @@ def Parseloghtml01(year, expedition, txt):
 
         ltriptext = triptext
         
-        mtail = re.search('(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
+        mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
         if mtail:
             #print mtail.group(0)
             ltriptext = ltriptext[:mtail.start(0)]
-        ltriptext = re.sub("</p>", "", ltriptext)
-        ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
-        ltriptext = re.sub("<p>|<br>", "\n\n", ltriptext).strip()
+        ltriptext = re.sub(r"</p>", "", ltriptext)
+        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+        ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
         #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
-        ltriptext = re.sub("</?u>", "_", ltriptext)
-        ltriptext = re.sub("</?i>", "''", ltriptext)
-        ltriptext = re.sub("</?b>", "'''", ltriptext)
+        ltriptext = re.sub(r"</?u>", "_", ltriptext)
+        ltriptext = re.sub(r"</?i>", "''", ltriptext)
+        ltriptext = re.sub(r"</?b>", "'''", ltriptext)
         
 
         #print ldate, trippeople.strip()
@@ -225,19 +231,19 @@ def Parseloghtml01(year, expedition, txt):
 
 
 def Parseloghtml03(year, expedition, txt):
-    tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
+    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
     for trippara in tripparas:
         s = re.match(u"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
         assert s, trippara
         tripheader, triptext = s.group(1), s.group(2)
-        tripheader = re.sub("&nbsp;", " ", tripheader)
-        tripheader = re.sub("\s+", " ", tripheader).strip()
+        tripheader = re.sub(r"&nbsp;", " ", tripheader)
+        tripheader = re.sub(r"\s+", " ", tripheader).strip()
         sheader = tripheader.split(" -- ")
         tu = ""
         if re.match("T/U|Time underwater", sheader[-1]):
             tu = sheader.pop()
         if len(sheader) != 3:
-            print "header not three pieces", sheader
+            print("header not three pieces", sheader)
         tripdate, triptitle, trippeople = sheader
         ldate = ParseDate(tripdate.strip(), year)
         triptitles = triptitle.split(" , ")
@@ -246,37 +252,12 @@ def Parseloghtml03(year, expedition, txt):
         else:
             tripcave = "UNKNOWN"
         #print tripcave, "---   ppp", triptitle, trippeople, len(triptext)
-        ltriptext = re.sub("</p>", "", triptext)
-        ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
-        ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
-        ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
+        ltriptext = re.sub(r"</p>", "", triptext)
+        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+        ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
+        ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
         EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
 
-yearlinks = [ 
-#                ("2013", "2013/logbook.html", Parseloghtmltxt), 
-                ("2012", "2012/logbook.html", Parseloghtmltxt), 
-                ("2011", "2011/logbook.html", Parseloghtmltxt), 
-                ("2010", "2010/logbook.html", Parselogwikitxt), 
-                ("2009", "2009/2009logbook.txt", Parselogwikitxt), 
-                ("2008", "2008/2008logbook.txt", Parselogwikitxt), 
-                ("2007", "2007/logbook.html", Parseloghtmltxt), 
-                ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt), 
-                ("2005", "2005/logbook.html", Parseloghtmltxt), 
-                ("2004", "2004/logbook.html", Parseloghtmltxt), 
-                ("2003", "2003/logbook.html", Parseloghtml03), 
-                ("2002", "2002/logbook.html", Parseloghtmltxt), 
-                ("2001", "2001/log.htm", Parseloghtml01), 
-                ("2000", "2000/log.htm", Parseloghtml01), 
-                ("1999", "1999/log.htm", Parseloghtml01), 
-                ("1998", "1998/log.htm", Parseloghtml01), 
-                ("1997", "1997/log.htm", Parseloghtml01), 
-                ("1996", "1996/log.htm", Parseloghtml01),
-                ("1995", "1995/log.htm", Parseloghtml01), 
-                ("1994", "1994/log.htm", Parseloghtml01), 
-                ("1993", "1993/log.htm", Parseloghtml01), 		
-                ("1992", "1992/log.htm", Parseloghtml01), 		
-                ("1991", "1991/log.htm", Parseloghtml01), 		
-            ]
 
 def SetDatesFromLogbookEntries(expedition):
     """
@@ -295,23 +276,41 @@ def SetDatesFromLogbookEntries(expedition):
             persontrip.persontrip_next = None
             lprevpersontrip = persontrip
             persontrip.save()
-            
-        
-        
+
+
 def LoadLogbookForExpedition(expedition):
     """ Parses all logbook entries for one expedition """
         
     expowebbase = os.path.join(settings.EXPOWEB, "years")  
     year = str(expedition.year)
-    for lyear, lloc, parsefunc in yearlinks:
-        if lyear == year:
-            break
-    fin = open(os.path.join(expowebbase, lloc))
-    print "opennning", lloc
-    txt = fin.read().decode("latin1")
-    fin.close()
-    parsefunc(year, expedition, txt)
-    SetDatesFromLogbookEntries(expedition)
+    yearlinks = settings.LOGBOOK_PARSER_SETTINGS
+
+    logbook_parseable = False
+
+    if expedition.year in yearlinks:
+        year_settings = yearlinks[expedition.year]
+        file_in = open(os.path.join(expowebbase, year_settings[0]))
+        txt = file_in.read().decode("latin1")
+        file_in.close()
+        parsefunc = year_settings[1]
+        logbook_parseable = True
+    else:
+        try:
+            file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE))
+            txt = file_in.read().decode("latin1")
+            file_in.close()
+            logbook_parseable = True
+            print("No set parser found using default")
+            parsefunc = settings.DEFAULT_LOGBOOK_PARSER
+        except (IOError):
+            logbook_parseable = False
+            print("Couldn't open default logbook file and nothing set for expo " + expo.year)
+
+    if logbook_parseable:
+        parser = globals()[parsefunc]
+        parser(expedition.year, expedition, txt)
+        SetDatesFromLogbookEntries(expedition)
+
     return "TOLOAD: " + year + "  " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + "  " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
 
 
@@ -324,25 +323,49 @@ def LoadLogbooks():
     #yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite
     #yearlinks = [ ("1996", "1996/log.htm", Parseloghtml01),] # overwrite
 
-    for year, lloc, parsefunc in yearlinks:
-        # This will not work until the corresponding year exists in the database. 
-        # In 2012 this needed noscript/folk.csv to be updated first.
-        expedition = models.Expedition.objects.filter(year = year)[0]
-        fin = open(os.path.join(expowebbase, lloc))
-        txt = fin.read().decode("latin1")
-        fin.close()
-        parsefunc(year, expedition, txt)
-        SetDatesFromLogbookEntries(expedition)
+    yearlinks = settings.LOGBOOK_PARSER_SETTINGS
 
-dateRegex = re.compile('<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
-expeditionYearRegex = re.compile('<span\s+class="expeditionyear">(.*?)</span>', re.S)
-titleRegex = re.compile('<H1>(.*?)</H1>', re.S)
-reportRegex = re.compile('<div\s+class="report">(.*)</div>\s*</body>', re.S)
-personRegex = re.compile('<div\s+class="person">(.*?)</div>', re.S)
-nameAuthorRegex = re.compile('<span\s+class="name(,author|)">(.*?)</span>', re.S)
-TURegex = re.compile('<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
-locationRegex = re.compile('<span\s+class="location">(.*?)</span>', re.S)
-caveRegex = re.compile('<span\s+class="cave">(.*?)</span>', re.S)
+    expos = models.Expedition.objects.all()
+    for expo in expos:
+        print("\nLoading Logbook for: " + expo.year)
+
+        logbook_parseable = False
+
+        if expo.year in yearlinks:
+            #print(yearlinks[expo.year])
+            year_settings = yearlinks[expo.year]
+            file_in = open(os.path.join(expowebbase, year_settings[0]))
+            txt = file_in.read().decode("latin1")
+            file_in.close()
+            parsefunc = year_settings[1]
+            logbook_parseable = True
+        else:
+            try:
+                file_in = open(os.path.join(expowebbase, expo.year, settings.DEFAULT_LOGBOOK_FILE))
+                txt = file_in.read().decode("latin1")
+                file_in.close()
+                logbook_parseable = True
+                print("No set parser found using default")
+                parsefunc = settings.DEFAULT_LOGBOOK_PARSER
+            except (IOError):
+                logbook_parseable = False
+                print("Couldn't open default logbook file and nothing in settings for expo " + expo.year)
+
+        if logbook_parseable:
+            parser = globals()[parsefunc]
+            parser(expo.year, expo, txt)
+            SetDatesFromLogbookEntries(expo)
+
+
+dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
+expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
+titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
+reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
+personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
+nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
+TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
+locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
+caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
 
 def parseAutoLogBookEntry(filename):
     errors = []
@@ -435,4 +458,4 @@ def parseAutoLogBookEntry(filename):
                           time_underground = TU, 
                           logbook_entry = logbookEntry, 
                           is_logbook_entry_author = author).save()
-    print logbookEntry
+    print(logbookEntry)
diff --git a/parsers/people.py b/parsers/people.py
index bc18472..3c3fc03 100644
--- a/parsers/people.py
+++ b/parsers/people.py
@@ -50,7 +50,7 @@ def LoadPersonsExpos():
     header = dict(zip(headers, range(len(headers))))
     
     # make expeditions
-    print "Loading expeditions"
+    print("Loading expeditions")
     years = headers[5:]
     
     for year in years:
@@ -61,7 +61,7 @@ def LoadPersonsExpos():
 
     
     # make persons
-    print "Loading personexpeditions"
+    print("Loading personexpeditions")
     #expoers2008 = """Edvin Deadman,Kathryn Hopkins,Djuke Veldhuis,Becka Lawson,Julian Todd,Natalie Uomini,Aaron Curtis,Tony Rooke,Ollie Stevens,Frank Tully,Martin Jahnke,Mark Shinwell,Jess Stirrups,Nial Peters,Serena Povia,Olly Madge,Steve Jones,Pete Harley,Eeva Makiranta,Keith Curtis""".split(",")
     #expomissing = set(expoers2008)
 
@@ -127,7 +127,7 @@ def GetPersonExpeditionNameLookup(expedition):
     res = { }
     duplicates = set()
     
-    print "Calculating GetPersonExpeditionNameLookup for", expedition.year
+    print("Calculating GetPersonExpeditionNameLookup for " + expedition.year)
     personexpeditions = models.PersonExpedition.objects.filter(expedition=expedition)
     for personexpedition in personexpeditions:
         possnames = [ ]
author	Sam Wenham <sam@wenhams.co.uk>	2019-03-06 23:20:34 +0000
committer	Sam Wenham <sam@wenhams.co.uk>	2019-03-06 23:20:34 +0000
commit	9fc80bed35a03295fad492601f802cab830144ae (patch)
tree	0be95a5e56fe69e403c934151d90a301c92ddf0e /parsers
parent	59f8647e0faaa37cc250db7dd2670fd7e0c4db5b (diff)
download	troggle-9fc80bed35a03295fad492601f802cab830144ae.tar.gz troggle-9fc80bed35a03295fad492601f802cab830144ae.tar.bz2 troggle-9fc80bed35a03295fad492601f802cab830144ae.zip