Fixed parsers

author: Philip Sargent <philip.sargent@gmail.com> 2023-09-02 17:49:37 +0300
committer: Philip Sargent <philip.sargent@gmail.com> 2023-09-02 17:49:37 +0300
commit: 1a8bc17f806d88b06aeabc28c11a6da199216fe2 (patch)
tree: f02f5038db4b02d78cf5fbf09bea91bb3a287966 /parsers/logbooks.py
parent: c9729c046ccccfd5e858f1fe8fbf24619d156e30 (diff)
download: troggle-1a8bc17f806d88b06aeabc28c11a6da199216fe2.tar.gz
troggle-1a8bc17f806d88b06aeabc28c11a6da199216fe2.tar.bz2
troggle-1a8bc17f806d88b06aeabc28c11a6da199216fe2.zip
1 files changed, 41 insertions, 25 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index fdc68ad..ad92931 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -60,8 +60,8 @@ LOGBOOK_PARSER_SETTINGS = {
 LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB
 
 ENTRIES = {
-    "2023": 81,
-    "2022": 93,
+    "2023": 83,
+    "2022": 94,
     "2019": 55,
     "2018": 95,
     "2017": 74,
@@ -127,7 +127,7 @@ def reset_trip_id(date):
     suffix = alphabet_suffix(n)
     
     tid = f"{date}{suffix}"
-    # print(tid)
+    # print(already, n, tid)
     return tid
 
 rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$")
@@ -136,6 +136,7 @@ rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
 def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
     res = []
     author = None
+    guests = []
     # print(f'# {tid}')
     # print(f" -  {tid} '{trippeople}'  ")
 
@@ -154,11 +155,12 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
                 try:
                     personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
                     if not personyear:
-                        if known_foreigner(tripperson):
-                            message = f" ! - {expedition.year} Known foreigner: '{tripperson}' in entry {tid=}"
+                        guests.append(nickname_used)
+                        if known_foreigner(nickname_used):
+                            message = f" ! - {expedition.year} Known foreigner: '{nickname_used}' in entry {tid=}"
                             print(message)
                         else:
-                            message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this year."
+                            message = f" ! - {expedition.year} No name match for: '{nickname_used}' in entry {tid=} for this year."
                             print(message)
                             DataIssue.objects.create(parser="logbooks", message=message)
                     res.append((personyear, nickname_used, logtime_underground))
@@ -170,10 +172,9 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
                     raise
                 if author_u:
                     author = personyear
-            else:
-                # a person but with * prefix. Ignored everywhere.
-                # print(f" ! - {expedition.year} * person : {tripperson}")
-                pass
+            else: # *guest     
+                guests.append(tripperson)
+                # print(f" ! - {expedition.year} * GUEST : {tripperson}")
                 
     if not author:
         if not res:
@@ -181,7 +182,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
         author = res[-1][0]  # the previous valid person and a time of 0 hours
 
     # print(f" -  {tid}  [{author.person}] '{res[0][0].person}'...")
-    return res, author
+    return res, author, guests
 
 def tidy_time_underground(logtime_underground):
     # Nasty hack, must tidy this up..
@@ -202,7 +203,7 @@ def tidy_time_underground(logtime_underground):
 
 def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
     try:
-        trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
+        trippersons, author, guests = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
         # trippersons is a list of tuples (personyear, nickname_used, logtime_underground)
     except:
         message = f" ! - {expedition.year} Logentry: {title} - GetTripPersons FAIL to recognise nickname"
@@ -216,7 +217,7 @@ def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
         DataIssue.objects.create(parser="logbooks", message=message)
         print(message)
         
-    return trippersons, author
+    return trippersons, author, guests
     
 def tidy_trip_cave(place):
     #  GetCaveLookup() needs to work better. None of this data is *used* though?
@@ -251,16 +252,18 @@ def tidy_tid(tid, title):
     tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")    
     return tid
     
-def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid):
+def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, guests, expedition, logtime_underground, tid):
     """saves a single logbook entry and related personlogentry items
     
     We could do a bulk update to save all the entries, but then we would need to do a query on
-    each one to get the primary key to asign to the PersonLogEntries. So overall probably not much
+    each one to get the primary key to assign to the PersonLogEntries. So overall probably not much
     faster ? 
     """
+    other_people = ", ".join(guests) # join list members separated by comma
     
     nonLookupAttribs = {
         "place": place,
+        "other_people": other_people, # *Ol's Mum, foreigners..
         "text": text,
         "expedition": expedition,
         "time_underground": logtime_underground,
@@ -324,6 +327,17 @@ def parser_date(tripdate, year):
 def parser_html(year, expedition, txt, seq=""):
     """This uses some of the more obscure capabilities of regular expressions,
     see https://docs.python.org/3/library/re.html
+    
+    e.g.
+    * is greedy
+    *? is non-greedy
+    
+    (?x) flag means VERBOSE
+    
+    (?: ) non-capturing parentheses
+    
+    \s whitespace
+    \S NOT whitespace
 
     You can't see it here, but a round-trip export-then-import will move
     the endmatter up to the frontmatter. This made sense when translating
@@ -357,7 +371,7 @@ def parser_html(year, expedition, txt, seq=""):
     for trippara in tripparas:
         logbook_entry_count += 1
         tid = set_trip_seq_id(year, logbook_entry_count)
-        # print(f' - new tid:{tid} lbe count: {logbook_entry_count}')
+        # print(f' - new seq tid:{tid} lbe count: {logbook_entry_count}')
 
         s = re.match(
             r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
@@ -367,15 +381,17 @@ def parser_html(year, expedition, txt, seq=""):
                             \s*<div\s+class="triptitle">\s*(.*?)</div>
                             ([\s\S]*?)
                             \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
+                            \s*(?:<div\s+class="editentry"\s*.*?</div>)?
                             \s*$
                      """,
             trippara,
         )
         if s:
             tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
+            # print(f"#{logbook_entry_count} {tu} {len(triptext)} ")
         else:  
             # if not re.search(r"Rigging Guide", trippara):
-            msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'"
+            msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:75]}'..."
             print(msg)
             DataIssue.objects.create(parser="logbooks", message=msg)
             continue
@@ -403,12 +419,12 @@ def parser_html(year, expedition, txt, seq=""):
             dupl[check] = 1
             
         tu = tidy_time_underground(tu)
-        trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid)
+        trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid)
         tripcave = tidy_trip_cave(place)
         tripcontent = tidy_trip_image_urls(tripcontent, ldate)
         tid = tidy_tid(tid, triptitle)
    
-        entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
+        entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid)
         logentries.append(entrytuple)
     return logentries
 
@@ -509,13 +525,13 @@ def parser_blog(year, expedition, txt, sq=""):
         tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
 
         logtime_underground = 0
-        trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid)
+        trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid)
         # print(f" -  author: {author}")
         tripcave = tidy_trip_cave(place)
         tripcontent = tidy_trip_image_urls(tripcontent, year)
         tid = tidy_tid(tid, triptitle)
 
-        entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
+        entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid)
         logentries.append(entrytuple)
     return logentries
 
@@ -621,10 +637,10 @@ def LoadLogbook(year):
             f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
         )
     for entrytuple in logentries:
-        date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
+        date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
         if expo == expedition: # unneeded check, we zeroed it before filling it
             # print(f" -- {triptitle}")
-            store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
+            store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid)
         else:
             print(f" ! unexpected log entry labelled as '{expedition}' {tid}" ) 
     expo.save() # to save logbook name property
@@ -708,8 +724,8 @@ def LoadLogbooks():
     # - LogBookEntry (text, who when etc.)
     # - PersonLogEntry (who was on that specific trip mentioned in the logbook entry)
     for entrytuple in allentries:
-        date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
-        store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
+        date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
+        store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid)
  
     for expo in expos: 
         expo.save() # to save logbook name property
author	Philip Sargent <philip.sargent@gmail.com>	2023-09-02 17:49:37 +0300
committer	Philip Sargent <philip.sargent@gmail.com>	2023-09-02 17:49:37 +0300
commit	1a8bc17f806d88b06aeabc28c11a6da199216fe2 (patch)
tree	f02f5038db4b02d78cf5fbf09bea91bb3a287966 /parsers/logbooks.py
parent	c9729c046ccccfd5e858f1fe8fbf24619d156e30 (diff)
download	troggle-1a8bc17f806d88b06aeabc28c11a6da199216fe2.tar.gz troggle-1a8bc17f806d88b06aeabc28c11a6da199216fe2.tar.bz2 troggle-1a8bc17f806d88b06aeabc28c11a6da199216fe2.zip