diff options
author | Philip Sargent <philip.sargent@klebos.com> | 2021-04-23 03:07:21 +0100 |
---|---|---|
committer | Philip Sargent <philip.sargent@klebos.com> | 2021-04-23 03:07:21 +0100 |
commit | dbd186e299fecd8f10f3dca0a88b78f842b0c59b (patch) | |
tree | cf90218918c0896ad770bcceb69f1e7df0d8c097 /parsers/logbooks.py | |
parent | 1a4be0f02e8ca2536bb754c7285c005478ad047a (diff) | |
download | troggle-dbd186e299fecd8f10f3dca0a88b78f842b0c59b.tar.gz troggle-dbd186e299fecd8f10f3dca0a88b78f842b0c59b.tar.bz2 troggle-dbd186e299fecd8f10f3dca0a88b78f842b0c59b.zip |
make ?reload private and clean old error msgs
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- | parsers/logbooks.py | 76 |
1 files changed, 66 insertions, 10 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py index 4310fdb..46aba96 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -21,6 +21,39 @@ Parses and imports logbooks in all their wonderful confusion # When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and # it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc) ''' +todo=''' +- Put the object store 'trips' and the 'logdataissues' into TROG global object + +- refactor everything with some urgency, esp. LoadLogbookForExpedition() + +- delete all the autoLogbooKEntry stuff when we are absolutely certain what it does + +- Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, + or it is broken/incomplete and need hand-editing. + +- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that + we keep only a modern HTML05 format. Then we can retiure the old parsers and reduce the + volume of code here substantially. + +- edit LoadLogbooks() to use coroutines to speed up import substantially, + but perhaps we had better profile it first? + +- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact. + +- the object store will need additional functions to replicate the persontrip calculation + and storage. For the moment we leave all that to be done in the django db + +- We should ensure logbook.html is utf-8 and stop this crap: + file_in = open(logbookfile,'rb') + txt = file_in.read().decode("latin1") +''' + +logentries = [] # the entire logbook for one year is a single object: a list of entries +noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau', + 'base camp', 'basecamp', 'top camp', 'topcamp' ] +logdataissues = {} +trips ={} + # # the logbook loading section @@ -77,12 +110,6 @@ def GetTripCave(place): return None -logentries = [] # the entire logbook for one year is a single object: a list of entries -noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau', - 'base camp', 'basecamp', 'top camp', 'topcamp' ] -logdataissues = {} -trips ={} - def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"): """ saves a logbook entry and related persontrips Does NOT save the expeditionday_id - all NULLs. why? @@ -205,8 +232,10 @@ def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, #print(" - New id ",tid) else: tid= tripid1 + if tid in trips: - msg = " ! DUPLICATE id .{}. {} ~{}~".format(tid, trips[tid][0], trips[tid][1]) + tyear, tdate, *trest = trips[tid] + msg = f" ! DUPLICATE on {tdate} id: '{tid}'" print(msg) DataIssue.objects.create(parser='logbooks', message=msg) tid= "d{}-s{:02d}".format(str(date),seq) @@ -427,6 +456,7 @@ def LoadLogbookForExpedition(expedition, expect): """ # absolutely horrid. REFACTOR THIS (all my fault..) global logentries + global logdataissues logbook_parseable = False logbook_cached = False yearlinks = settings.LOGBOOK_PARSER_SETTINGS @@ -445,6 +475,26 @@ def LoadLogbookForExpedition(expedition, expect): return False return True + def cleanerrors(year): + global logdataissues + print(f' - CLEAN {year} {len(logdataissues)} data issues in total') + dataissues = DataIssue.objects.filter(parser='logbooks') + for di in dataissues: + ph = "t" + year + "-" + if re.search(ph, di.message) is not None: + print(f' - CLEANING dataissue {di.message}') + di.delete() + + for te, content in logdataissues: + # tripentry = year + "." + str(logbook_entry_count) + print(f' - CLEAN {te}') + if te.startswith(year + "."): + print(f' - CLEANING logdataissue {te}') + logdataissues.pop(te) + + + cleanerrors(expedition.year) + if expedition.year in yearlinks: logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0]) expedition.logbookfile = yearlinks[expedition.year][0] @@ -478,10 +528,10 @@ def LoadLogbookForExpedition(expedition, expect): print(" -- Loaded ", len(logentries), " log entries") logbook_cached = True else: - print(" !- Should be ", expect, " but ", len(logentries), " found in cache") + print(" !- Told to expect ", expect, " but ", len(logentries), " found in cache") raise except: - print(" ! Failed to load corrupt cache. Deleting it.") + print(" ! Failed to load corrupt cache. (Or I was told to ignore it). Deleting it.") os.remove(cache_filename) logentries=[] raise @@ -554,7 +604,7 @@ def LoadLogbooks(): TROG['pagecache']['expedition'][expo.year] = None # clear cache if expo.year not in nologbook: print((" - Logbook for: " + expo.year)) - numentries = LoadLogbookForExpedition(expo, entries[expo.year]) + numentries = LoadLogbookForExpedition(expo, entries[expo.year]) # this actually loads the logbook for one year log.write("{} {:5d} should be {}\n".format(expo.year, numentries, entries[expo.year])) nlbe[expo.year]=numentries expd[expo.year]= 0 @@ -588,6 +638,12 @@ locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S) caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S) def parseAutoLogBookEntry(filename): + '''An AutoLogBookEntry appears to be one that was created online using a form, for a single trip, + which is then stored in a separate location to the usual logbook.html + But when importing logbook.html all these individual entries also need ot be parsed. + + This is all redundant as we are getting rid of the whole individual trip entry system + ''' errors = [] f = open(filename, "r") contents = f.read() |