summaryrefslogtreecommitdiffstats
path: root/parsers/logbooks.py
diff options
context:
space:
mode:
authorPhilip Sargent <philip.sargent@gmail.com>2023-01-28 10:47:25 +0000
committerPhilip Sargent <philip.sargent@gmail.com>2023-01-28 10:47:25 +0000
commite4c804b30585da3ff300659ed364b00c65e850b7 (patch)
treec9ebd0124ac896f7a1053a00b98ed51bd7acd49b /parsers/logbooks.py
parente01bd39609ba14232544125cb78ecd3c2ba99ea7 (diff)
downloadtroggle-e4c804b30585da3ff300659ed364b00c65e850b7.tar.gz
troggle-e4c804b30585da3ff300659ed364b00c65e850b7.tar.bz2
troggle-e4c804b30585da3ff300659ed364b00c65e850b7.zip
refactoring author checks
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r--parsers/logbooks.py73
1 files changed, 42 insertions, 31 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index a65774d..fd1e7eb 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -21,13 +21,20 @@ Parses and imports logbooks in all their wonderful confusion
https://expo.survex.com/handbook/computing/logbooks-parsing.html
"""
todo = """
-- refactor everything with some urgency, esp. parse_logbook_for_expedition()
+- Most of the time is during the database writing (13s out of 14s).
+
+- Move a lot of non-db code from store_entry_into_database()
+into parse_logbook_for_expedition()
+
+- call GetTripPersons at parsing time, not db writing time
+- this is a slow and uncertain function too: cave = getCaveByReference(caveRef)
+
+- if I am certain that we are creating from scratch, don't use save_carefully() to
+create the Django objects. And I am, because I delete the outdated stuff.
- pre-compile all the heavily used regular expressions !
-- break out the code that hits the database from that which parses the logbook
-so that the file-reading and parsing can be parallelized, while writing to the
-database remains serialized (sqlite is single-user).
+- refactor to get rid of the global 'logentries', very ugly indeed.
- profile the code to find bad repetitive things, of which there are many.
@@ -35,13 +42,14 @@ database remains serialized (sqlite is single-user).
- far too many uses of Django field dereferencing to get values, which is SLOW
+- replace explicit 1970 date with a constant EPOCH
+
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
- We should ensure logbook.html is utf-8 and stop this crap:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
-- this is a slow and uncertain function: cave = getCaveByReference(caveRef)
- use Fixtures https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-loaddata to cache
data for old logbooks? Not worth it..
@@ -162,17 +170,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
# print(f" - {tid} [{author.person}] '{res[0][0].person}'...")
return res, author
-
-def store_entry_into_database(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
- """saves a single logbook entry and related persontrips
- Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
-
- troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite but we are saving the same thing too many times..
-
- Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
- lookupAttribs={'date':date, 'title':title}
- """
-
+def tidy_time_underground(logtime_underground):
# Nasty hack, must tidy this up..
if logtime_underground:
try:
@@ -187,7 +185,9 @@ def store_entry_into_database(date, place, title, text, trippeople, expedition,
logtime_underground = 0
else:
logtime_underground = 0
+ return logtime_underground
+def tidy_trip_persons(trippeople, expedition, logtime_underground, tid):
try:
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
# print(f" - {author} - {logtime_underground}")
@@ -195,14 +195,25 @@ def store_entry_into_database(date, place, title, text, trippeople, expedition,
message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
DataIssue.objects.create(parser="logbooks", message=message)
print(message)
- raise
+ # raise
return
if not author:
message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
DataIssue.objects.create(parser="logbooks", message=message)
print(message)
- # return
+
+ return trippersons, author
+
+def store_entry_into_database(date, place, title, text, trippersons, author, expedition, logtime_underground, tid=None):
+ """saves a single logbook entry and related persontrips
+ Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
+
+ troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite but we are saving the same thing too many times..
+
+ Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
+ lookupAttribs={'date':date, 'title':title}
+ """
# This needs attention. The slug field is derived from 'title'
# both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
@@ -257,9 +268,6 @@ def store_entry_into_database(date, place, title, text, trippeople, expedition,
# this creates the PersonTrip instance.
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
-
-
-
def parser_date(tripdate, year):
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
dummydate = date(1970, 1, 1) # replace with _EPOCH
@@ -395,7 +403,10 @@ def parser_html(year, expedition, txt, seq=""):
else:
dupl[check] = 1
- entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tripid1)
+ tu = tidy_time_underground(tu)
+
+ trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
+ entrytuple = (ldate, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
logentries.append(entrytuple)
@@ -439,7 +450,7 @@ def parser_blog(year, expedition, txt, sq=""):
print(f"{len(tripheads)} - {len(tripparas)}")
location = "Plateau" # best guess, fix manually later
- tu = 0
+ tu = 0 # no logged time underground in a blog entry
logbook_entry_count = 0
for i in range(0, len(tripparas)):
tripstuff = tripparas[i]
@@ -493,7 +504,8 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
- entrytuple = (tripdate, location, triptitle, tripcontent, trippeople, expedition, tu, tid)
+ trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
+ entrytuple = (tripdate, location, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
logentries.append(entrytuple)
def clean_all_logbooks():
@@ -564,7 +576,6 @@ def parse_logbook_for_expedition(expedition, blog=False):
print(f" ! Very Bad Error opening {lb}")
if logbook_parseable:
-
# --------------------
parser = globals()[parsefunc]
print(f" - {year} parsing with {parsefunc} - {lb}")
@@ -572,8 +583,8 @@ def parse_logbook_for_expedition(expedition, blog=False):
# --------------------
# move database storage into separate step
# for entrytuple in logentries:
- # date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
- # store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+ # date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
+ # store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
if len(logentries) == expect:
# print(f"OK {year} {len(logentries):5d} is {expect}\n")
@@ -603,10 +614,10 @@ def LoadLogbook(year):
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
)
for entrytuple in logentries:
- date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+ date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
if expo == expedition:
#print(f" - {triptitle}")
- store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+ store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
expedition.save() # to save logbook name property
def LoadLogbooks():
@@ -692,8 +703,8 @@ def LoadLogbooks():
# - LogBookEntry (text, who when etc.)
# - PersonTrip (who was on that specific trip mentioned in the logbook entry)
for entrytuple in allentries:
- date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
- store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+ date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
+ store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
for expo in expos:
expedition.save() # to save logbook name property