tidy and comments

author: Philip Sargent <philip.sargent@gmail.com> 2022-12-10 13:00:57 +0000
committer: Philip Sargent <philip.sargent@gmail.com> 2022-12-10 13:00:57 +0000
commit: 0e47909704cb8ebcb634f875964bd76fb36c4ec9 (patch)
tree: d650da912e3059f243e93a384b02a4f3b96ad008 /parsers/logbooks.py
parent: cabcada0b8738366bce33173ad1b3d376e8fb73c (diff)
download: troggle-0e47909704cb8ebcb634f875964bd76fb36c4ec9.tar.gz
troggle-0e47909704cb8ebcb634f875964bd76fb36c4ec9.tar.bz2
troggle-0e47909704cb8ebcb634f875964bd76fb36c4ec9.zip
1 files changed, 48 insertions, 47 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 889387d..0bbc23d 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -93,7 +93,7 @@ entries = { "2022": 64, "2019": 56, "2018": 74, "2017": 60, "2016": 81, "2015":
     "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, 
     "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41, 
     "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
-    "1985": 24,"1984": 32,"1983": 52,"1982": 42,}
+    "1985": 24, "1984": 32, "1983": 52, "1982": 42,}
 # Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
 
 logentries = [] # the entire logbook for one year is a single object: a list of entries
@@ -471,57 +471,58 @@ def parser_html_01(year, expedition, txt):
                 print(message)
                 return
 
-# parser for 2003
-def parser_html_03(year, expedition, txt):
-    global logentries
-    global logdataissues
-
-    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
-    logbook_entry_count = 0
-    for trippara in tripparas:
-        logbook_entry_count += 1
-        tid = set_trip_id(year,logbook_entry_count) # default trip id, before we read the date
+# parser for 2003. Retired after conversion of the logbook.html
+# KEEP THIS COMMENTED-OUT example until after we have done the same thing with the html_01 parser
+# def parser_html_03(year, expedition, txt):
+    # global logentries
+    # global logdataissues
+
+    # tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
+    # logbook_entry_count = 0
+    # for trippara in tripparas:
+        # logbook_entry_count += 1
+        # tid = set_trip_id(year,logbook_entry_count) # default trip id, before we read the date
         
-        s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
-        if not ( s ) :
-            message = " ! - Skipping logentry {year} on failure to parse parser_html_03: {} {} {}...".format(tid,s,trippara[:300])
-            DataIssue.objects.create(parser='logbooks', message=message)
-            logdataissues[tid]=message
-            print(message)
-            break
+        # s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
+        # if not ( s ) :
+            # message = " ! - Skipping logentry {year} on failure to parse parser_html_03: {} {} {}...".format(tid,s,trippara[:300])
+            # DataIssue.objects.create(parser='logbooks', message=message)
+            # logdataissues[tid]=message
+            # print(message)
+            # break
         
-        tripheader, triptext = s.group(1), s.group(2)
-        tripheader = re.sub(r"&nbsp;", " ", tripheader)
-        tripheader = re.sub(r"\s+", " ", tripheader).strip()
-        sheader = tripheader.split(" -- ")
-        tu = ""
-        if re.match("T/U|Time underwater", sheader[-1]):
-            tu = sheader.pop() # not a number in 2003 usually
-            # print(f" -  {logbook_entry_count} '{tu}' ")
-        if len(sheader) != 3:
-            print(" ! Header not three pieces for parser_html_03() ", sheader)
-        tripdate, triptitle, trippeople = sheader
-        ldate = ParseDate(tripdate.strip(), year)
-        # print(f" -  {logbook_entry_count} '{ldate}' from '{tripdate.strip()}' ")
-        # print(f" -  {logbook_entry_count} '{trippeople}'  ")
-        titlelist = triptitle.split(" , ")
-        if len(titlelist) >= 2:
-            location, *namelist = titlelist # list unpacking operator
-            tripname = ", ".join(namelist) # concatenate strings
-            # print(f" -  {logbook_entry_count} {location}  '{tripname}'")
-        else:
-            location = "UNKNOWN"
+        # tripheader, triptext = s.group(1), s.group(2)
+        # tripheader = re.sub(r"&nbsp;", " ", tripheader)
+        # tripheader = re.sub(r"\s+", " ", tripheader).strip()
+        # sheader = tripheader.split(" -- ")
+        # tu = ""
+        # if re.match("T/U|Time underwater", sheader[-1]):
+            # tu = sheader.pop() # not a number in 2003 usually
+            # # print(f" -  {logbook_entry_count} '{tu}' ")
+        # if len(sheader) != 3:
+            # print(" ! Header not three pieces for parser_html_03() ", sheader)
+        # tripdate, triptitle, trippeople = sheader
+        # ldate = ParseDate(tripdate.strip(), year)
+        # # print(f" -  {logbook_entry_count} '{ldate}' from '{tripdate.strip()}' ")
+        # # print(f" -  {logbook_entry_count} '{trippeople}'  ")
+        # titlelist = triptitle.split(" , ")
+        # if len(titlelist) >= 2:
+            # location, *namelist = titlelist # list unpacking operator
+            # tripname = ", ".join(namelist) # concatenate strings
+            # # print(f" -  {logbook_entry_count} {location}  '{tripname}'")
+        # else:
+            # location = "UNKNOWN"
             
-        ltriptext = triptext + "<br /><br />\n\n" + tu
-        ltriptext = re.sub(r"</p>", "", ltriptext)
-        #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-        ltriptext = re.sub(r"<p>", "<br /><br />\n\n", ltriptext).strip()
-        #ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
+        # ltriptext = triptext + "<br /><br />\n\n" + tu
+        # ltriptext = re.sub(r"</p>", "", ltriptext)
+        # #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+        # ltriptext = re.sub(r"<p>", "<br /><br />\n\n", ltriptext).strip()
+        # #ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
 
 
-        entrytuple = (ldate, location, tripname, ltriptext, 
-                trippeople, expedition, tu, tid)
-        logentries.append(entrytuple)
+        # entrytuple = (ldate, location, tripname, ltriptext, 
+                # trippeople, expedition, tu, tid)
+        # logentries.append(entrytuple)
 
 
 def LoadLogbookForExpedition(expedition):
author	Philip Sargent <philip.sargent@gmail.com>	2022-12-10 13:00:57 +0000
committer	Philip Sargent <philip.sargent@gmail.com>	2022-12-10 13:00:57 +0000
commit	0e47909704cb8ebcb634f875964bd76fb36c4ec9 (patch)
tree	d650da912e3059f243e93a384b02a4f3b96ad008 /parsers/logbooks.py
parent	cabcada0b8738366bce33173ad1b3d376e8fb73c (diff)
download	troggle-0e47909704cb8ebcb634f875964bd76fb36c4ec9.tar.gz troggle-0e47909704cb8ebcb634f875964bd76fb36c4ec9.tar.bz2 troggle-0e47909704cb8ebcb634f875964bd76fb36c4ec9.zip