author    Philip Sargent <philip.sargent@gmail.com>  2023-01-28 13:14:54 +0000
committer Philip Sargent <philip.sargent@gmail.com>  2023-01-28 13:14:54 +0000
commit    9e71be8169e77ea71d080f3535ee0fce036cf838 (patch)
tree      3aed973ffec5a4a9939983f16a5c9625e5d07d10
parent    db0504057b988ea0ccc982a53a48334084dc48bc (diff)
refactored, global removed
-rw-r--r--  core/utils.py          2
-rw-r--r--  parsers/logbooks.py  122
2 files changed, 50 insertions, 74 deletions
diff --git a/core/utils.py b/core/utils.py
index 8bb7c2a..0b18e3b 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -203,6 +203,8 @@ def save_carefully(objectType, lookupAttribs={}, nonLookupAttribs={}):
defined in core.models.TroggleModel.
We are not using new_since_parsing - it is a fossil from Aaron Curtis's design in 2006. So it is always false.
+
+ NOTE: this takes twice as long as simply creating a new object with the given values.
"""
try:
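
For context, save_carefully() follows a look-up-then-write pattern, so every call costs a SELECT on the lookup attributes before the INSERT or UPDATE, whereas a bare create() is a single INSERT. A minimal sketch of the difference, using Django's update_or_create() to stand in for save_carefully()'s semantics (date, title and nonLookupAttribs assumed already in scope):

# Two round-trips per object: SELECT on lookup attributes, then write.
lbo, created = LogbookEntry.objects.update_or_create(
    date=date, title=title,        # lookup attributes
    defaults=nonLookupAttribs,     # non-lookup attributes
)

# One round-trip: safe only because the outdated objects were deleted first.
lbo = LogbookEntry.objects.create(date=date, title=title, **nonLookupAttribs)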
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index f5250a2..bb592f3 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -13,7 +13,7 @@ from django.template.defaultfilters import slugify
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
from troggle.core.models.troggle import DataIssue, Expedition
-from troggle.core.utils import save_carefully, get_process_memory
+from troggle.core.utils import get_process_memory
"""
Parses and imports logbooks in all their wonderful confusion
@@ -21,27 +21,16 @@ Parses and imports logbooks in all their wonderful confusion
https://expo.survex.com/handbook/computing/logbooks-parsing.html
"""
todo = """
-- Most of the time is during the database writing (13s out of 14s).
+- Most of the time is during the database writing (6s out of 8s).
-- Move a lot of non-db code from store_entry_into_database()
-into parse_logbook_for_expedition()
-
-- call GetTripPersons at parsing time, not db writing time
- this is a slow and uncertain function too: cave = getCaveByReference(caveRef)
-- if I am certain that we are creating from scratch, don't use save_carefully() to
-create the Django objects. And I am, because I delete the outdated stuff.
-
- pre-compile all the heavily used regular expressions !
-- refactor to get rid of the global 'logentries', very ugly indeed.
-
- profile the code to find bad repetitive things, of which there are many.
- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted
-- far too many uses of Django field dereferencing to get values, which is SLOW
-
- replace explicit 1970 date with a constant EPOCH
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
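
The generator item suggests an obvious shape for the parsers, and the regex item pairs naturally with it. A hedged sketch (split_trips and make_entrytuple are placeholder names, not real troggle functions):

import re

TRIPPARA = re.compile(r"</p>")  # compiled once at import, reused per entry

def parser_html_gen(year, expedition, txt, seq=""):
    for rawtrip in split_trips(txt):       # placeholder splitter
        yield make_entrytuple(rawtrip)     # one entry in memory at a time

A caller would then iterate with "for entrytuple in parser_html_gen(...)" instead of accumulating the whole list.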
@@ -228,53 +217,41 @@ def tidy_trip_image_urls(text, date):
text = text.replace("\t", "")
text = text.replace("\n\n\n", "\n\n")
return text
+
+def tidy_tid(tid, title):
+
+ if tid is not None:
+ return tid
+
+ # print(f"! {title=} ")
+ tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
+ return tid
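
When the parser did not carry a tid through, tidy_tid() manufactures one from a random four-digit prefix plus a slugified fragment of the title; an existing tid is passed through untouched. For example:

tidy_tid(None, "Fishface and Pitbull go caving")
# -> something like '4821_fishface_a' (random prefix, first 10 slug chars)
tidy_tid("2023-07-01a", "Fishface and Pitbull go caving")
# -> '2023-07-01a'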
-def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid=None):
+def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid):
"""saves a single logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. Why? Because we are deprecating expeditionday!
-
- troggle.log shows that we are creating lots of duplicates, which is no no problem with SQL as they just overwrite but we are saving the same thing too many times..
-
- Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
- lookupAttribs={'date':date, 'title':title}
"""
- text = tidy_trip_image_urls(text, date)
- # Check for an existing copy of the current entry, and save
- expedition.get_expedition_day(date)
-
- lookupAttribs = {"date": date, "title": title}
- # but it is a db query which we should try to avoid - rewrite this
-
- # This needs attention. The slug field is derived from 'title'
- # NEW slug for a logbook entry here! Unique id + slugified title fragment
+ # gets the current ExpeditionDay, and saves it as an object attached to
+ # the expedition, but does not attach it to the logbook entry. Why ?
+
+ # expedition.get_expedition_day(date)
- if tid is not None:
- slug = tid
- # slug = tid + "_" + slugify(title)[:10].replace('-','_')
- else:
- slug = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
nonLookupAttribs = {
"place": place,
"text": text,
"expedition": expedition,
"time_underground": logtime_underground,
"cave_slug": str(tripcave),
- "slug": slug,
+ "slug": tid,
}
- # Rewriting as we know prior objects have already been deleted.
- # This creates the lbo instance of LogbookEntry
+ lookupAttribs = {"date": date, "title": title}
lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
- # lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
-
- # for PersonTrip time_underground is float (decimal hours)
+
for tripperson, time_underground in trippersons:
- # print(f" - {tid} '{tripperson}' author:{tripperson == author}")
lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo}
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
- # this creates the PersonTrip instance.
pt = PersonTrip.objects.create(**nonLookupAttribs, **lookupAttribs)
- # save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
def parser_date(tripdate, year):
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
@@ -321,7 +298,7 @@ def parser_html(year, expedition, txt, seq=""):
the endmatter up to the frontmatter. This made sense when translating
from parser_html_01 format logfiles, believe me.
"""
- global logentries
+ logentries = []
dupl = {}
# extract front material and stash for later use when rebuilding from list of entries
@@ -397,9 +374,8 @@ def parser_html(year, expedition, txt, seq=""):
place = triptitles[0]
else:
place = "UNKNOWN"
- ltriptext = re.sub(r"</p>", "", triptext)
- # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
+ tripcontent = re.sub(r"</p>", "", triptext)
+ tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()
triptitle = triptitle.strip()
# triptitle must be unique for a given date. We fix this here.
@@ -414,9 +390,12 @@ def parser_html(year, expedition, txt, seq=""):
tu = tidy_time_underground(tu)
trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
tripcave = tidy_trip_cave(place)
-
- entrytuple = (ldate, place, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
+ tripcontent = tidy_trip_image_urls(tripcontent, date)
+ tid = tidy_tid(tid, triptitle)
+
+ entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
logentries.append(entrytuple)
+ return logentries
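
The renamed tripcontent is produced by deleting closing </p> tags and turning each opening <p> into a double <br />; the same substitution pair on a trivial input (plain re, nothing troggle-specific):

import re

t = "<p>Pushed the streamway.</p><p>Got very wet.</p>"
t = re.sub(r"</p>", "", t)
t = re.sub(r"<p>", "<br /><br />", t).strip()
# t == "<br /><br />Pushed the streamway.<br /><br />Got very wet."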
def parser_blog(year, expedition, txt, sq=""):
@@ -437,7 +416,7 @@ def parser_blog(year, expedition, txt, sq=""):
So the content is nested inside the header. Attachments (images) come after the content.
It's a bugger, but it's out of our control.
"""
- global logentries
+ logentries = []
tripheads = re.findall(
r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
@@ -515,9 +494,12 @@ def parser_blog(year, expedition, txt, sq=""):
trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
tripcave = tidy_trip_cave(place)
-
+ tripcontent = tidy_trip_image_urls(tripcontent, date)
+ tid = tidy_tid(tid, triptitle)
+
entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
logentries.append(entrytuple)
+ return logentries
def clean_all_logbooks():
DataIssue.objects.filter(parser="logbooks").delete()
@@ -538,7 +520,6 @@ def clean_logbook_for_expedition(expedition):
def parse_logbook_for_expedition(expedition, blog=False):
"""Parses all logbook entries for one expedition
"""
- global logentries
global ENTRIES
logentries = []
@@ -590,7 +571,7 @@ def parse_logbook_for_expedition(expedition, blog=False):
# --------------------
parser = globals()[parsefunc]
print(f" - {year} parsing with {parsefunc} - {lb}")
- parser(year, expedition, txt, sq) # this launches the right parser for this year
+ logentries = parser(year, expedition, txt, sq) # this launches the right parser
# --------------------
if len(logentries) == expect:
@@ -599,35 +580,33 @@ def parse_logbook_for_expedition(expedition, blog=False):
else:
print(f"Mismatch in number of log entries: {year} {len(logentries):5d} is not {expect}\n")
- return len(logentries)
+ return logentries
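
The parser = globals()[parsefunc] line above dispatches on a function name held as a string in the per-year settings; a minimal standalone illustration of the pattern (parser_stub is hypothetical):

def parser_stub(year, expedition, txt, seq=""):
    return [("entrytuple for", year)]

parsefunc = "parser_stub"
parser = globals()[parsefunc]               # look the function up by name
logentries = parser(2022, None, "<html/>")  # same as calling parser_stub(...)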
def LoadLogbook(year):
"""One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
This is inside an atomic transaction"""
- global logentries
- nlbe = {}
expo = Expedition.objects.get(year=year)
year = expo.year # some type funny
clean_logbook_for_expedition(expo)
logentries = []
- nlbe[expo] = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
+ logentries = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
if year in BLOG_PARSER_SETTINGS:
- nlbe[expo] = parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
+ logentries += parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
else:
print(
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
)
for entrytuple in logentries:
- date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
- if expo == expedition: # unneeded check, we zeroed it bbefore filling it
+ date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
+ if expo == expedition: # unneeded check, we zeroed it before filling it
#print(f" - {triptitle}")
- store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
+ store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
else:
- print(f" ! unexpected log entry labelled as '{expedition}' {tripid1}" )
- expedition.save() # to save logbook name property
+ print(f" ! unexpected log entry labelled as '{expedition}' {tid}" )
+ expo.save() # to save logbook name property
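
The docstring notes that LoadLogbook runs inside an atomic transaction; the call site is not shown in this diff, but presumably it is wrapped along these lines (a hedged sketch using Django's standard API):

from django.db import transaction

with transaction.atomic():   # all-or-nothing reload of one year's logbook
    LoadLogbook("2022")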
def LoadLogbooks():
"""This is the master function for parsing all logbooks into the Troggle database.
@@ -688,20 +667,15 @@ def LoadLogbooks():
bloglist.append(expo)
for ex in loglist:
- nlbe[ex] = parse_logbook_for_expedition(ex) # this loads the logbook for one expo
+ logentries = parse_logbook_for_expedition(ex) # this loads the logbook for one expo
allentries += logentries
for b in bloglist:
print(f" - BLOG: {b}")
- nlbe[b] += parse_logbook_for_expedition(b, blog=True) # loads the blog logbook for one expo
+ logentries = parse_logbook_for_expedition(b, blog=True) # loads the blog logbook for one expo
allentries += logentries
- yt = 0
- for exp in nlbe:
- yt += nlbe[exp]
- print(f"total {yt:,} log entries parsed in all expeditions")
-
- print(f"total {len(allentries):,} log entries in complete dict")
+ print(f"total {len(allentries):,} log entries parsed in all expeditions")
mem = get_process_memory()
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
duration = time.time() - start
@@ -712,11 +686,11 @@ def LoadLogbooks():
# - LogbookEntry (text, who, when etc.)
# - PersonTrip (who was on that specific trip mentioned in the logbook entry)
for entrytuple in allentries:
- date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
- store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
-
+ date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
+ store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
+
for expo in expos:
- expedition.save() # to save logbook name property
+ expo.save() # to save logbook name property
mem = get_process_memory()
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
duration = time.time() - start
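
Taken together, the refactor leaves LoadLogbooks with a clean two-phase shape: parse every logbook into plain tuples first, then do all the database writes in one pass. Roughly (a reconstruction from the hunks above, not verbatim code):

allentries = []
for ex in loglist:
    allentries += parse_logbook_for_expedition(ex)            # pure parsing
for b in bloglist:
    allentries += parse_logbook_for_expedition(b, blog=True)  # blog logbooks

for entrytuple in allentries:
    store_entry_into_database(*entrytuple)                    # all db writes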