Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r--  parsers/logbooks.py  |  76
1 file changed, 66 insertions(+), 10 deletions(-)
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 4310fdb..46aba96 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -21,6 +21,39 @@ Parses and imports logbooks in all their wonderful confusion
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc.)
'''
+todo='''
+- Put the object store 'trips' and the 'logdataissues' into TROG global object
+
+- refactor everything with some urgency, esp. LoadLogbookForExpedition()
+
+- delete all the autoLogbookEntry stuff when we are absolutely certain what it does
+
+- Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no fully-working parser,
+  or the parser is broken/incomplete and the files need hand-editing.
+
+- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
+  we keep only a modern HTML5 format. Then we can retire the old parsers and reduce the
+  volume of code here substantially.
+
+- edit LoadLogbooks() to use coroutines to speed up import substantially,
+ but perhaps we had better profile it first?
+
+- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
+
+- the object store will need additional functions to replicate the persontrip calculation
+ and storage. For the moment we leave all that to be done in the django db
+
+- We should ensure logbook.html is utf-8 and stop this crap (see the sketch just below this hunk):
+ file_in = open(logbookfile,'rb')
+ txt = file_in.read().decode("latin1")
+'''
+
+logentries = [] # the entire logbook for one year is a single object: a list of entries
+noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
+ 'base camp', 'basecamp', 'top camp', 'topcamp' ]
+logdataissues = {}
+trips = {}
+
#
# the logbook loading section
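A note on the utf-8 item in the todo list above: the fix is to open the file as text with an explicit encoding rather than decoding raw bytes by hand. This is only a minimal sketch, assuming a latin-1 fallback is still wanted for unconverted files; read_logbook_text is a hypothetical helper, not a function in this module.

    def read_logbook_text(logbookfile):
        # Try utf-8 first; fall back to latin-1 only for legacy files that
        # were never converted. (Sketch only, not the parser's current code.)
        try:
            with open(logbookfile, encoding="utf-8") as f:
                return f.read()
        except UnicodeDecodeError:
            with open(logbookfile, encoding="latin-1") as f:
                return f.read()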
@@ -77,12 +110,6 @@ def GetTripCave(place):
return None
-logentries = [] # the entire logbook for one year is a single object: a list of entries
-noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
- 'base camp', 'basecamp', 'top camp', 'topcamp' ]
-logdataissues = {}
-trips ={}
-
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"):
""" saves a logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why?
@@ -205,8 +232,10 @@ def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu,
#print(" - New id ",tid)
else:
tid= tripid1
+
if tid in trips:
- msg = " ! DUPLICATE id .{}. {} ~{}~".format(tid, trips[tid][0], trips[tid][1])
+ tyear, tdate, *trest = trips[tid]
+ msg = f" ! DUPLICATE on {tdate} id: '{tid}'"
print(msg)
DataIssue.objects.create(parser='logbooks', message=msg)
tid= "d{}-s{:02d}".format(str(date),seq)
@@ -427,6 +456,7 @@ def LoadLogbookForExpedition(expedition, expect):
"""
# absolutely horrid. REFACTOR THIS (all my fault..)
global logentries
+ global logdataissues
logbook_parseable = False
logbook_cached = False
yearlinks = settings.LOGBOOK_PARSER_SETTINGS
@@ -445,6 +475,26 @@ def LoadLogbookForExpedition(expedition, expect):
return False
return True
+    def cleanerrors(year):
+        global logdataissues
+        print(f' - CLEAN {year} {len(logdataissues)} data issues in total')
+        dataissues = DataIssue.objects.filter(parser='logbooks')
+        for di in dataissues:
+            ph = "t" + year + "-"
+            if re.search(ph, di.message) is not None:
+                print(f' - CLEANING dataissue {di.message}')
+                di.delete()
+
+        for te in list(logdataissues):  # iterate over a copy of the keys: entries are popped below
+            # tripentry = year + "." + str(logbook_entry_count)
+            print(f' - CLEAN {te}')
+            if te.startswith(year + "."):
+                print(f' - CLEANING logdataissue {te}')
+                logdataissues.pop(te)
+
+
+    cleanerrors(expedition.year)
+
if expedition.year in yearlinks:
logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0])
expedition.logbookfile = yearlinks[expedition.year][0]
@@ -478,10 +528,10 @@ def LoadLogbookForExpedition(expedition, expect):
print(" -- Loaded ", len(logentries), " log entries")
logbook_cached = True
else:
- print(" !- Should be ", expect, " but ", len(logentries), " found in cache")
+ print(" !- Told to expect ", expect, " but ", len(logentries), " found in cache")
raise
except:
- print(" ! Failed to load corrupt cache. Deleting it.")
+ print(" ! Failed to load corrupt cache. (Or I was told to ignore it). Deleting it.")
os.remove(cache_filename)
logentries=[]
raise
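The two messages changed above both belong to the cache check: a count mismatch raises out of the inner try, and the except clause deletes the stale pickle so the logbook is re-parsed from source. A stripped-down sketch of that pattern, keeping the names cache_filename, expect and the idea of a pickled entry list from the diff, with everything else assumed:

    import os
    import pickle

    def load_cached_entries(cache_filename, expect):
        # Return the cached list only if it holds the expected number of
        # entries; otherwise delete the cache and let the caller re-parse.
        try:
            with open(cache_filename, "rb") as f:
                entries = pickle.load(f)
            if len(entries) != expect:
                raise ValueError(f"told to expect {expect} but {len(entries)} found in cache")
            return entries
        except Exception:
            if os.path.exists(cache_filename):
                os.remove(cache_filename)
            return None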
@@ -554,7 +604,7 @@ def LoadLogbooks():
TROG['pagecache']['expedition'][expo.year] = None # clear cache
if expo.year not in nologbook:
print((" - Logbook for: " + expo.year))
- numentries = LoadLogbookForExpedition(expo, entries[expo.year])
+ numentries = LoadLogbookForExpedition(expo, entries[expo.year]) # this actually loads the logbook for one year
log.write("{} {:5d} should be {}\n".format(expo.year, numentries, entries[expo.year]))
nlbe[expo.year]=numentries
expd[expo.year]= 0
@@ -588,6 +638,12 @@ locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
def parseAutoLogBookEntry(filename):
+    '''An AutoLogBookEntry appears to be one that was created online using a form, for a single trip,
+    which is then stored in a separate location from the usual logbook.html.
+    But when importing logbook.html, all these individual entries also need to be parsed.
+
+    This is all redundant, as we are getting rid of the whole individual trip entry system.
+    '''
errors = []
f = open(filename, "r")
contents = f.read()
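For illustration, the span regexes defined just above this function pull single fields out of one of these per-trip HTML files along these lines (the sample HTML is invented):

    import re

    locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
    caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)

    sample = '<span class="cave">1623-161</span> <span class="location">Kaninchenhöhle</span>'
    print(caveRegex.search(sample).group(1))       # -> 1623-161
    print(locationRegex.search(sample).group(1))   # -> Kaninchenhöhle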