summary | refs | log | tree | commit | diff | stats
path: root/parsers/logbooks.py
diff options
context:
space:
mode:
author: Philip Sargent <philip.sargent@klebos.com> 2022-08-30 17:58:49 +0300
committer: Philip Sargent <philip.sargent@klebos.com> 2022-08-30 17:58:49 +0300
commit: 0853bbdd19f93ae5e4d7615843e99380d6dab437 (patch)
tree: c565198ae653fb0521682b8299480239015bbf02 /parsers/logbooks.py
parent: 6daa96b69e40ef6d4709d6550428048ce5de4ac2 (diff)
download: troggle-0853bbdd19f93ae5e4d7615843e99380d6dab437.tar.gz
troggle-0853bbdd19f93ae5e4d7615843e99380d6dab437.tar.bz2
troggle-0853bbdd19f93ae5e4d7615843e99380d6dab437.zip
Many fixes and speedups
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- parsers/logbooks.py 129
1 files changed, 68 insertions, 61 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 41d0895..40311b6 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -27,33 +27,36 @@ An idea which no longer seems sensible given that we rely on the database to do
# it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc)
'''
todo='''
-- Put the object store 'trips' and the 'logdataissues' into TROG global object
-
- Use the .shelve.db cache for all logbooks, not just individually
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
--- far too many uses of Django field dereferencing to get values, which is SLOW
+- profile the code to find bad repetitive things, of which there are many.
+
+- far too many uses of Django field dereferencing to get values, which is SLOW
-- Loogbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
+- Logbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
- we keep only a modern HTML05 format. Then we can retiure the old parsers and reduce the
+ we keep only a modern HTML05 format. Then we can retire the old parsers and reduce the
volume of code here substantially.
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
-
-- the object store will need additional functions to replicate the persontrip calculation
- and storage. For the moment we leave all that to be done in the django db
- Concurrent synchronisation would be nice..
-
-- DB lock currently prevents multiple threads for loading logbooks. But asyncio might work..?
- We should ensure logbook.html is utf-8 and stop this crap:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
- this is a slow and uncertain function: cave = getCaveByReference(caveRef)
+
+- the object store will need additional functions to replicate the persontrip calculation
+ and storage. For the moment we leave all that to be done in the django db
+ Concurrent synchronisation would be nice..
+
+- DB lock currently prevents multiple threads for loading logbooks. But asyncio might work..?
+
+- Put the object store 'trips' and the 'logdataissues' into TROG global object
+
'''
logentries = [] # the entire logbook for one year is a single object: a list of entries
@@ -62,7 +65,7 @@ noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plate
logdataissues = TROG['issues']['logdataissues']
trips ={}
-entries = { "2022": 42, "2019": 44, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
+entries = { "2022": 62, "2019": 44, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
@@ -101,7 +104,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
if not personyear:
- message = f" ! - {expedition.year} No name match for: '{tripperson}' "
+ message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this expedition year."
print(message)
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
@@ -115,7 +118,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
return res, author
-def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None):
+def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
""" saves a logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
@@ -132,7 +135,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
return
if not author:
- message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry in year "
+ message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["title"]=message
print(message)
@@ -163,9 +166,9 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
slug = tid + "_" + slugify(title)[:10].replace('-','_')
else:
slug = str(randint(1000,10000)) + "_" + slugify(title)[:10].replace('-','_')
- nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug, 'entry_type':entry_type}
+ nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug}
- # This cretes the lbo instance of LogbookEntry
+ # This creates the lbo instance of LogbookEntry
lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
@@ -253,14 +256,14 @@ def Parselogwikitxt(year, expedition, txt):
trippeople, expedition, tu, "wiki", tripid)
logentries.append(entrytuple)
- EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople,
- expedition=expedition, logtime_underground=0, tid=tid)
+ # EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople,
+ # expedition=expedition, logtime_underground=0, tid=tid)
- EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople,
- tu, "wiki", tripid, logbook_entry_count, tid=tid)
+ # EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople,
+ # tu, "wiki", tripid, logbook_entry_count, tid=tid)
-def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq, tid=None):
+def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, tripid1, seq, tid=None):
'''Called once for each logbook entry as the logbook is parsed
'''
# This will need additional functions to replicate the persontrip calculation and storage. For the
@@ -280,7 +283,7 @@ def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu,
if not tid:
tid = set_trip_id(str(date),seq)
- trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu, formattype)
+ trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu)
## copy a lot of checking functionality here from EnterLogIntoDbase()
# GetTripPersons is a db query, so this will need to be put in ObjStore before this will work..
@@ -353,12 +356,12 @@ def Parseloghtmltxt(year, expedition, txt):
trippeople, expedition, tu, "html", tripid1)
logentries.append(entrytuple)
- EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
- trippeople=trippeople, expedition=expedition, logtime_underground=0,
- entry_type="html", tid=tid)
+ # EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
+ # trippeople=trippeople, expedition=expedition, logtime_underground=0,
+ # entry_type="html", tid=tid)
- EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
- "html", tripid1, logbook_entry_count, tid=tid)
+ # EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
+ # "html", tripid1, logbook_entry_count, tid=tid)
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
@@ -455,24 +458,24 @@ def Parseloghtml01(year, expedition, txt):
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, "html01", tid)
logentries.append(entrytuple)
- try:
- EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
- trippeople=trippeople, expedition=expedition, logtime_underground=0,
- entry_type="html", tid=tid)
- except:
- message = " ! - Enter log entry into database FAIL exception in: " + tid
- DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues[tid]=message
- print(message)
+ # try:
+ # EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
+ # trippeople=trippeople, expedition=expedition, logtime_underground=0,
+ # entry_type="html", tid=tid)
+ # except:
+ # message = " ! - Enter log entry into database FAIL exception in: " + tid
+ # DataIssue.objects.create(parser='logbooks', message=message)
+ # logdataissues[tid]=message
+ # print(message)
- try:
- EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
- "html01", tid, logbook_entry_count, tid=tid)
- except:
- message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid
- DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues[tid]=message
- print(message)
+ # try:
+ # EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
+ # "html01", tid, logbook_entry_count, tid=tid)
+ # except:
+ # message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid
+ # DataIssue.objects.create(parser='logbooks', message=message)
+ # logdataissues[tid]=message
+ # print(message)
except:
message = f" ! - Skipping logentry {year} due to exception in: {tid}"
@@ -514,7 +517,7 @@ def Parseloghtml03(year, expedition, txt):
if re.match("T/U|Time underwater", sheader[-1]):
tu = sheader.pop()
if len(sheader) != 3:
- print((" ! Header not three pieces", sheader))
+ print(" ! Header not three pieces", sheader)
tripdate, triptitle, trippeople = sheader
ldate = ParseDate(tripdate.strip(), year)
triptitles = triptitle.split(" , ")
@@ -532,12 +535,12 @@ def Parseloghtml03(year, expedition, txt):
trippeople, expedition, tu, "html03", tid)
logentries.append(entrytuple)
- EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
- text = ltriptext, trippeople=trippeople, expedition=expedition,
- logtime_underground=0, entry_type="html", tid=tid)
+ # EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
+ # text = ltriptext, trippeople=trippeople, expedition=expedition,
+ # logtime_underground=0, entry_type="html", tid=tid)
- EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
- "html03", tid, logbook_entry_count, tid=tid)
+ # EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
+ # "html03", tid, logbook_entry_count, tid=tid)
def SetDatesFromLogbookEntries(expedition):
@@ -618,7 +621,7 @@ def LoadLogbookForExpedition(expedition):
expedition.logbookfile = yearlinks[year][0]
parsefunc = yearlinks[year][1]
else:
- logbookpath = os.path.join(expologbase, year, settings.DEFAULT_LOGBOOK_FILE)
+ logbookpath = Path(expologbase) / year / settings.DEFAULT_LOGBOOK_FILE
expedition.logbookfile = settings.DEFAULT_LOGBOOK_FILE
parsefunc = settings.DEFAULT_LOGBOOK_PARSER
cache_filename = Path(str(logbookpath) + ".cache")
@@ -639,13 +642,13 @@ def LoadLogbookForExpedition(expedition):
print(" - ! Cache is > 30 days old")
bad_cache= True
if bad_cache:
- print(" - ! Cache is either stale or more than 30 days old. Deleting it.")
+ print(" - so cache is either stale or more than 30 days old. Deleting it.")
os.remove(cache_filename)
logentries=[]
- print(" ! Removed stale or corrupt cache file")
+ print(" - Deleted stale or corrupt cache file")
raise
- # print(" - Reading cache: " + str(cache_filename), end='')
try:
+ # print(" - Reading cache: " + str(cache_filename), end='')
with open(cache_filename, "rb") as f:
year,n,logentries = pickle.load(f)
if validcache(year,n):
@@ -660,21 +663,22 @@ def LoadLogbookForExpedition(expedition):
logentries=[]
raise
except :
- print(" - Cache de-pickle failure \"" + str(cache_filename) +"\"")
+ print(" - Cache old or de-pickle failure \"" + str(cache_filename) +"\"")
try:
file_in = open(logbookpath,'rb')
- txt = file_in.read().decode("latin1")
+ txt = file_in.read().decode("utf-8")
file_in.close()
logbook_parseable = True
except (IOError):
logbook_parseable = False
- print((" ! Couldn't open logbook " + logbookpath))
+ print(" ! Couldn't open logbook as UTF-8 " + logbookpath)
if logbook_parseable:
parser = globals()[parsefunc]
print(f' - Using parser {parsefunc}')
parser(year, expedition, txt) # this launches the right parser for this year
+ print(" - Setting dates from logbook entries")
SetDatesFromLogbookEntries(expedition)
if len(logentries) >0:
print(" - Cacheing " , len(logentries), " log entries")
@@ -686,11 +690,14 @@ def LoadLogbookForExpedition(expedition):
i=0
for entrytuple in logentries:
- date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
+ try:
+ date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+ except ValueError: # cope with removal of entry_type but still in cache files. Remove in Sept. 2022.
+ date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
- entry_type, tripid1)
+ tripid1)
EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, logtime_underground,
- entry_type, tripid1, i)
+ tripid1, i)
i +=1
SetDatesFromLogbookEntries(expedition)