author    Philip Sargent <philip.sargent@gmail.com>  2023-01-28 13:14:54 +0000
committer Philip Sargent <philip.sargent@gmail.com>  2023-01-28 13:14:54 +0000
commit    9e71be8169e77ea71d080f3535ee0fce036cf838 (patch)
tree      3aed973ffec5a4a9939983f16a5c9625e5d07d10
parent    db0504057b988ea0ccc982a53a48334084dc48bc (diff)
refactored, global removed
-rw-r--r--  core/utils.py          2
-rw-r--r--  parsers/logbooks.py  122
2 files changed, 50 insertions, 74 deletions
diff --git a/core/utils.py b/core/utils.py
index 8bb7c2a..0b18e3b 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -203,6 +203,8 @@ def save_carefully(objectType, lookupAttribs={}, nonLookupAttribs={}):
defined in core.models.TroggleModel.
We are not using new_since_parsing - it is a fossil from Aaron Curtis's design in 2006. So it is always false.
+
+ NOTE: this takes twice as long as simply creating a new object with the given values.
"""
try:
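
For context, save_carefully() follows a look-up-then-write pattern, so every call costs a SELECT on the lookup attributes before the INSERT or UPDATE, whereas a bare create() is a single INSERT. A minimal sketch of the difference, using Django's update_or_create() to stand in for save_carefully()'s semantics (date, title and nonLookupAttribs assumed already in scope):

# Two round-trips per object: SELECT on lookup attributes, then write.
lbo, created = LogbookEntry.objects.update_or_create(
    date=date, title=title,        # lookup attributes
    defaults=nonLookupAttribs,     # non-lookup attributes
)

# One round-trip: safe only because the outdated objects were deleted first.
lbo = LogbookEntry.objects.create(date=date, title=title, **nonLookupAttribs)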
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index f5250a2..bb592f3 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -13,7 +13,7 @@ from django.template.defaultfilters import slugify
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
from troggle.core.models.troggle import DataIssue, Expedition
-from troggle.core.utils import save_carefully, get_process_memory
+from troggle.core.utils import get_process_memory
"""
Parses and imports logbooks in all their wonderful confusion
@@ -21,27 +21,16 @@ Parses and imports logbooks in all their wonderful confusion
https://expo.survex.com/handbook/computing/logbooks-parsing.html
"""
todo = """
-- Most of the time is during the database writing (13s out of 14s).
+- Most of the time is during the database writing (6s out of 8s).
-- Move a lot of non-db code from store_entry_into_database()
-into parse_logbook_for_expedition()
-
-- call GetTripPersons at parsing time, not db writing time
- this is a slow and uncertain function too: cave = getCaveByReference(caveRef)
-- if I am certain that we are creating from scratch, don't use save_carefully() to
-create the Django objects. And I am, because I delete the outdated stuff.
-
- pre-compile all the heavily used regular expressions !
-- refactor to get rid of the global 'logentries', very ugly indeed.
-
- profile the code to find bad repetitive things, of which there are many.
- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted
-- far too many uses of Django field dereferencing to get values, which is SLOW
-
- replace explicit 1970 date with a constant EPOCH
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
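
The generator item suggests an obvious shape for the parsers, and the regex item pairs naturally with it. A hedged sketch (split_trips and make_entrytuple are placeholder names, not real troggle functions):

import re

TRIPPARA = re.compile(r"</p>")  # compiled once at import, reused per entry

def parser_html_gen(year, expedition, txt, seq=""):
    for rawtrip in split_trips(txt):       # placeholder splitter
        yield make_entrytuple(rawtrip)     # one entry in memory at a time

A caller would then iterate with "for entrytuple in parser_html_gen(...)" instead of accumulating the whole list.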
@@ -228,53 +217,41 @@ def tidy_trip_image_urls(text, date):
text = text.replace("\t", "")
text = text.replace("\n\n\n", "\n\n")
return text
+
+def tidy_tid(tid, title):
+
+ if tid is not None:
+ return tid
+
+ # print(f"! {title=} ")
+ tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
+ return tid
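
When the parser did not carry a tid through, tidy_tid() manufactures one from a random four-digit prefix plus a slugified fragment of the title; an existing tid is passed through untouched. For example:

tidy_tid(None, "Fishface and Pitbull go caving")
# -> something like '4821_fishface_a' (random prefix, first 10 slug chars)
tidy_tid("2023-07-01a", "Fishface and Pitbull go caving")
# -> '2023-07-01a'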
-def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid=None):
+def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid):
"""saves a single logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. Why? Because we are deprecating expeditionday!
-
- troggle.log shows that we are creating lots of duplicates, which is no no problem with SQL as they just overwrite but we are saving the same thing too many times..
-
- Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
- lookupAttribs={'date':date, 'title':title}
"""
- text = tidy_trip_image_urls(text, date)
- # Check for an existing copy of the current entry, and save
- expedition.get_expedition_day(date)
-
- lookupAttribs = {"date": date, "title": title}
- # but it is a db query which we should try to avoid - rewrite this
-
- # This needs attention. The slug field is derived from 'title'
- # NEW slug for a logbook entry here! Unique id + slugified title fragment
+ # gets the current ExpeditionDay, and saves it as an object attached to
+ # the expedition, but does not attach it to the logbook entry. Why ?
+
+ # expedition.get_expedition_day(date)
- if tid is not None:
- slug = tid
- # slug = tid + "_" + slugify(title)[:10].replace('-','_')
- else:
- slug = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
nonLookupAttribs = {
"place": place,
"text": text,
"expedition": expedition,
"time_underground": logtime_underground,
"cave_slug": str(tripcave),
- "slug": slug,
+ "slug": tid,
}
- # Rewriting as we know prior objects have already been deleted.
- # This creates the lbo instance of LogbookEntry
+ lookupAttribs = {"date": date, "title": title}
lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
- # lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
-
- # for PersonTrip time_underground is float (decimal hours)
+
for tripperson, time_underground in trippersons:
- # print(f" - {tid} '{tripperson}' author:{tripperson == author}")
lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo}
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
- # this creates the PersonTrip instance.
pt = PersonTrip.objects.create(**nonLookupAttribs, **lookupAttribs)
- # save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
def parser_date(tripdate, year):
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
@@ -321,7 +298,7 @@ def parser_html(year, expedition, txt, seq=""):
the endmatter up to the frontmatter. This made sense when translating
from parser_html_01 format logfiles, believe me.
"""
- global logentries
+ logentries = []
dupl = {}
# extract front material and stash for later use when rebuilding from list of entries
@@ -397,9 +374,8 @@ def parser_html(year, expedition, txt, seq=""):
place = triptitles[0]
else:
place = "UNKNOWN"
- ltriptext = re.sub(r"</p>", "", triptext)
- # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
+ tripcontent = re.sub(r"</p>", "", triptext)
+ tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()
triptitle = triptitle.strip()
# triptitle must be unique for a given date. We fix this here.
@@ -414,9 +390,12 @@ def parser_html(year, expedition, txt, seq=""):
tu = tidy_time_underground(tu)
trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
tripcave = tidy_trip_cave(place)
-
- entrytuple = (ldate, place, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
+ tripcontent = tidy_trip_image_urls(tripcontent, date)
+ tid = tidy_tid(tid, triptitle)
+
+ entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
logentries.append(entrytuple)
+ return logentries
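
The renamed tripcontent is produced by deleting closing </p> tags and turning each opening <p> into a double <br />; the same substitution pair on a trivial input (plain re, nothing troggle-specific):

import re

t = "<p>Pushed the streamway.</p><p>Got very wet.</p>"
t = re.sub(r"</p>", "", t)
t = re.sub(r"<p>", "<br /><br />", t).strip()
# t == "<br /><br />Pushed the streamway.<br /><br />Got very wet."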
def parser_blog(year, expedition, txt, sq=""):
@@ -437,7 +416,7 @@ def parser_blog(year, expedition, txt, sq=""):
So the content is nested inside the header. Attachments (images) come after the content.
It's a bugger, but it's out of our control.
"""
- global logentries
+ logentries = []
tripheads = re.findall(
r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
@@ -515,9 +494,12 @@ def parser_blog(year, expedition, txt, sq=""):
trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
tripcave = tidy_trip_cave(place)
-
+ tripcontent = tidy_trip_image_urls(tripcontent, date)
+ tid = tidy_tid(tid, triptitle)
+
entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
logentries.append(entrytuple)
+ return logentries
def clean_all_logbooks():
DataIssue.objects.filter(parser="logbooks").delete()
@@ -538,7 +520,6 @@ def clean_logbook_for_expedition(expedition):
def parse_logbook_for_expedition(expedition, blog=False):
"""Parses all logbook entries for one expedition
"""
- global logentries
global ENTRIES
logentries = []
@@ -590,7 +571,7 @@ def parse_logbook_for_expedition(expedition, blog=False):
# --------------------
parser = globals()[parsefunc]
print(f" - {year} parsing with {parsefunc} - {lb}")
- parser(year, expedition, txt, sq) # this launches the right parser for this year
+ logentries = parser(year, expedition, txt, sq) # this launches the right parser
# --------------------
if len(logentries) == expect:
@@ -599,35 +580,33 @@ def parse_logbook_for_expedition(expedition, blog=False):
else:
print(f"Mismatch in number of log entries: {year} {len(logentries):5d} is not {expect}\n")
- return len(logentries)
+ return logentries
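
The parser = globals()[parsefunc] line above dispatches on a function name held as a string in the per-year settings; a minimal standalone illustration of the pattern (parser_stub is hypothetical):

def parser_stub(year, expedition, txt, seq=""):
    return [("entrytuple for", year)]

parsefunc = "parser_stub"
parser = globals()[parsefunc]               # look the function up by name
logentries = parser(2022, None, "<html/>")  # same as calling parser_stub(...)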
def LoadLogbook(year):
"""One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
This is inside an atomic transaction"""
- global logentries
- nlbe = {}
expo = Expedition.objects.get(year=year)
year = expo.year # some type funny
clean_logbook_for_expedition(expo)
logentries = []
- nlbe[expo] = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
+ logentries = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
if year in BLOG_PARSER_SETTINGS:
- nlbe[expo] = parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
+ logentries += parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
else:
print(
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
)
for entrytuple in logentries:
- date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
- if expo == expedition: # unneeded check, we zeroed it bbefore filling it
+ date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
+ if expo == expedition: # unneeded check, we zeroed it before filling it
#print(f" - {triptitle}")
- store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
+ store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
else:
- print(f" ! unexpected log entry labelled as '{expedition}' {tripid1}" )
- expedition.save() # to save logbook name property
+ print(f" ! unexpected log entry labelled as '{expedition}' {tid}" )
+ expo.save() # to save logbook name property
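
The docstring notes that LoadLogbook runs inside an atomic transaction; the call site is not shown in this diff, but presumably it is wrapped along these lines (a hedged sketch using Django's standard API):

from django.db import transaction

with transaction.atomic():   # all-or-nothing reload of one year's logbook
    LoadLogbook("2022")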
def LoadLogbooks():
"""This is the master function for parsing all logbooks into the Troggle database.
@@ -688,20 +667,15 @@ def LoadLogbooks():
bloglist.append(expo)
for ex in loglist:
- nlbe[ex] = parse_logbook_for_expedition(ex) # this loads the logbook for one expo
+ logentries = parse_logbook_for_expedition(ex) # this loads the logbook for one expo
allentries += logentries
for b in bloglist:
print(f" - BLOG: {b}")
- nlbe[b] += parse_logbook_for_expedition(b, blog=True) # loads the blog logbook for one expo
+ logentries = parse_logbook_for_expedition(b, blog=True) # loads the blog logbook for one expo
allentries += logentries
- yt = 0
- for exp in nlbe:
- yt += nlbe[exp]
- print(f"total {yt:,} log entries parsed in all expeditions")
-
- print(f"total {len(allentries):,} log entries in complete dict")
+ print(f"total {len(allentries):,} log entries parsed in all expeditions")
mem = get_process_memory()
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
duration = time.time() - start
@@ -712,11 +686,11 @@ def LoadLogbooks():
# - LogbookEntry (text, who, when etc.)
# - PersonTrip (who was on that specific trip mentioned in the logbook entry)
for entrytuple in allentries:
- date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
- store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
-
+ date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
+ store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
+
for expo in expos:
- expedition.save() # to save logbook name property
+ expo.save() # to save logbook name property
mem = get_process_memory()
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
duration = time.time() - start
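
Taken together, the refactor leaves LoadLogbooks with a clean two-phase shape: parse every logbook into plain tuples first, then do all the database writes in one pass. Roughly (a reconstruction from the hunks above, not verbatim code):

allentries = []
for ex in loglist:
    allentries += parse_logbook_for_expedition(ex)            # pure parsing
for b in bloglist:
    allentries += parse_logbook_for_expedition(b, blog=True)  # blog logbooks

for entrytuple in allentries:
    store_entry_into_database(*entrytuple)                    # all db writes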