From be410d4d9ddb01c780f5998ddf0fa42fd036cecb Mon Sep 17 00:00:00 2001
From: Philip Sargent
Date: Thu, 24 Mar 2022 01:05:50 +0000
Subject: minor refactoring

---
 parsers/logbooks.py | 117 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 66 insertions(+), 51 deletions(-)

(limited to 'parsers/logbooks.py')

diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index e91c69e..0a3443a 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -31,6 +31,8 @@ todo='''
 
 - refactor everything with some urgency, esp. LoadLogbookForExpedition()
 
+-- far too many uses of Django field dereferencing to get values, which is SLOW
+
 - Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser,
 or it is broken/incomplete and need hand-editing.
 
@@ -61,6 +63,14 @@ noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plate
 logdataissues = TROG['issues']['logdataissues']
 trips ={}
 
+entries = { "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
+    "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
+    "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
+    "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
+    "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
+    "1985": 24,"1984": 32,"1983": 52,"1982": 42,}
+# Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
+
 #
 # the logbook loading section
 #
@@ -534,19 +544,25 @@ def SetDatesFromLogbookEntries(expedition):
                 #persontrip.save() # also saved in EnterLogIntoDbase. MERGE these to speed up import.
 
 
-def LoadLogbookForExpedition(expedition, expect):
+def LoadLogbookForExpedition(expedition):
     """ Parses all logbook entries for one expedition
    If a cache is found it uses it. If not found, or fails sanity checks, parses source file.
    """
     # absolutely horrid. REFACTOR THIS (all my fault..)
     global logentries
     global logdataissues
+    global entries
+
     logbook_parseable = False
     logbook_cached = False
     yearlinks = settings.LOGBOOK_PARSER_SETTINGS
     expologbase = os.path.join(settings.EXPOWEB, "years")
     logentries=[]
 
+    year = expedition.year
+    expect = entries[year]
+    # print(" - Logbook for: " + year)
+
     def validcache(year,n):
         if year != expedition:
             print(" ! year != expedition ",year, expedition )
@@ -578,14 +594,14 @@ def LoadLogbookForExpedition(expedition, expect):
         for i in dellist:
             del logdataissues[i]
 
-    cleanerrors(expedition.year)
+    cleanerrors(year)
 
-    if expedition.year in yearlinks:
-        logbookpath = Path(expologbase) / expedition.year / yearlinks[expedition.year][0]
-        expedition.logbookfile = yearlinks[expedition.year][0]
-        parsefunc = yearlinks[expedition.year][1]
+    if year in yearlinks:
+        logbookpath = Path(expologbase) / year / yearlinks[year][0]
+        expedition.logbookfile = yearlinks[year][0]
+        parsefunc = yearlinks[year][1]
     else:
-        logbookpath = os.path.join(expologbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE)
+        logbookpath = os.path.join(expologbase, year, settings.DEFAULT_LOGBOOK_FILE)
         expedition.logbookfile = settings.DEFAULT_LOGBOOK_FILE
         parsefunc = settings.DEFAULT_LOGBOOK_PARSER
     cache_filename = Path(str(logbookpath) + ".cache")
@@ -616,7 +632,7 @@ def LoadLogbookForExpedition(expedition, expect):
             with open(cache_filename, "rb") as f:
                 year,n,logentries = pickle.load(f)
             if validcache(year,n):
-                print(" -- Loaded ", len(logentries), " log entries")
+                print(f" -- {year} : Loaded {len(logentries)} log entries")
                 logbook_cached = True
             else:
                 print(" !- Told to expect ", expect, " but ", len(logentries), " found in cache")
@@ -640,7 +656,7 @@ def LoadLogbookForExpedition(expedition, expect):
     if logbook_parseable:
         parser = globals()[parsefunc]
         print(f' - Using parser {parsefunc}')
-        parser(expedition.year, expedition, txt) # this launches the right parser for this year
+        parser(year, expedition, txt) # this launches the right parser for this year
 
     SetDatesFromLogbookEntries(expedition)
     if len(logentries) >0:
@@ -656,10 +672,17 @@ def LoadLogbookForExpedition(expedition, expect):
             date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
             EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
                                  entry_type, tripid1)
-            EnterLogIntoObjStore(expedition.year, date, tripcave, triptitle, text, trippeople, logtime_underground,
+            EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, logtime_underground,
                                  entry_type, tripid1, i)
             i +=1
     SetDatesFromLogbookEntries(expedition)
+
+    if len(logentries) == expect:
+        # print(f"OK {year} {len(logentries):5d} is {expect}\n")
+        pass
+    else:
+        print(f"BAD {year} {len(logentries):5d} is not {expect}\n")
+
     return len(logentries)
 
@@ -668,6 +691,7 @@ def LoadLogbooks():
     """ This should be rewritten to use coroutines to load all logbooks from disc in parallel.
     """
     global logdataissues
+    global entries
 
     logdataissues = {}
     DataIssue.objects.filter(parser='logbooks').delete()
@@ -682,51 +706,42 @@ def LoadLogbooks():
     lostlogbook = ["1976", "1977", "1978", "1979", "1980", "1981"]
     sqlfail = ["1987", "1988", "1989"] # breaks mysql with db constraint fail - debug locally first]
     nologbook = noexpo + lostlogbook + sqlfail
-    entries = { "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
-        "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
-        "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
-        "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
-        "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
-        "1985": 24,"1984": 32,"1983": 52,"1982": 42,}
-    # Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
-    try:
-        os.remove("loadlogbk.log")
-    except OSError:
-        pass
+
     nlbe={}
     expd ={}
-    with open("loadlogbk.log", "a") as log:
-        for expo in expos:
-            TROG['pagecache']['expedition'][expo.year] = None # clear cache
-            if expo.year in sqlfail:
-                print(" - Logbook for: " + expo.year + " NO parsing attempted - known sql failures")
-                message = f" ! - Not even attempting to parse logbook for {expo.year} until code fixed"
-                DataIssue.objects.create(parser='logbooks', message=message)
-                logdataissues[f"sqlfail {expo.year}"]=message
-                print(message)
+    actuals = []
+
+    for expo in expos:
+        year = expo.year
+        TROG['pagecache']['expedition'][year] = None # clear cache
+        if year in sqlfail:
+            print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
+            message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
+            DataIssue.objects.create(parser='logbooks', message=message)
+            logdataissues[f"sqlfail {year}"]=message
+            print(message)
 
-            if expo.year not in nologbook:
-                print((" - Logbook for: " + expo.year))
-                if expo.year in entries:
-                    numentries = LoadLogbookForExpedition(expo, entries[expo.year]) # this actually loads the logbook for one year
-                    log.write("{} {:5d} should be {}\n".format(expo.year, numentries, entries[expo.year]))
-                    nlbe[expo.year]=numentries
-                    expd[expo.year]= 0
-                else:
-                    print(" - No Logbook yet for: " + expo.year) # catch case when preparing for next expo
-    print("** total trips in ObjStore:", len(trips))
-    #for i in logdataissues:
-    #    print("{:15s}: {}".format(i, logdataissues[i]))
+        if year not in nologbook:
+            if year in entries:
+                actuals.append(expo)
+            else:
+                print(" - No Logbook yet for: " + year) # catch case when preparing for next expo
+
+    for ex in actuals:
+        nlbe[ex] = LoadLogbookForExpedition(ex) # this actually loads the logbook for one expo
 
-    for lbe in trips:
-        year, date, tripcave, triptitle, text, trippeople, tu, formattype = trips[lbe]
-        expd[year] += 1
-    yt=0
-    for y in expd:
-        # print("{} {}".format(y, expd[y]), nlbe[y])
-        yt += expd[y]
-    print("total {} log entries in all expeditions".format(yt))
-
+    # tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
+    # yt = 0
+    # for r in map(LoadLogbookForExpedition, actuals):
+    #    yt = r
+
+    yt = 0
+    for e in nlbe:
+        yt += nlbe[e]
+    print(f"total {yt:,} log entries parsed in all expeditions")
+    if yt != len(trips):
+        print(f"** total trips in ObjStore:{len(trips):,}")
+
     try:
         shelvfilenm = 'logbktrips.shelve' # ".db" automatically appended after python 3.8
         with shelve.open(shelvfilenm, writeback=True) as odb:
--
cgit v1.2.3
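
The pattern behind most of the churn in this patch is the one flagged in the new todo line: LoadLogbookForExpedition() used to dereference expedition.year many times over, and the todo note calls that kind of repeated Django field dereferencing SLOW; at minimum it repeats an attribute lookup on the model instance for every use. The fix is simply to read the field once into a local and use that everywhere. A contrived before/after sketch of the idea (the expedition object and yearlinks mapping here stand in for the troggle ones):

    def lookup_slow(expedition, yearlinks):
        # Every use of expedition.year repeats the attribute lookup.
        if expedition.year in yearlinks:
            return yearlinks[expedition.year][0], yearlinks[expedition.year][1]

    def lookup_fast(expedition, yearlinks):
        # Dereference the field once, then use the cheap local binding.
        year = expedition.year
        if year in yearlinks:
            return yearlinks[year][0], yearlinks[year][1]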
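
A note on the caching this function relies on: the parsed entries are pickled beside the logbook source in a file named after it plus ".cache", as a (year, count, entries) tuple, and validcache() accepts the cache only when the stored year and entry count agree with what the module-level entries table predicts; anything else falls through to a full re-parse. A minimal standalone sketch of that load-or-parse shape (the mtime comparison and the exception handling are assumptions here, and load_with_cache/parse_logbook are illustrative names, not troggle functions):

    import pickle
    from pathlib import Path

    def load_with_cache(logbookpath, year, expect, parse_logbook):
        """Return parsed entries, re-parsing only when the pickle cache looks wrong."""
        source = Path(logbookpath)
        cache = Path(str(logbookpath) + ".cache")

        # Fast path: a cache file newer than the source that passes sanity checks.
        if cache.is_file() and cache.stat().st_mtime > source.stat().st_mtime:
            try:
                with open(cache, "rb") as f:
                    cached_year, n, entries = pickle.load(f)
                if cached_year == year and n == len(entries) == expect:
                    return entries
            except (pickle.UnpicklingError, EOFError):
                pass  # corrupt cache: fall through and re-parse

        # Slow path: parse the source file and rewrite the cache beside it.
        entries = parse_logbook(source)
        with open(cache, "wb") as f:
            pickle.dump((year, len(entries), entries), f)
        return entries

Storing the count alongside the data makes corruption detection cheap: a stale or half-written cache will usually fail the count check even when it unpickles cleanly.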
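
The commented-out map() block in LoadLogbooks() records a failed experiment that explains why the load loop stays sequential: the per-expedition loads are independent, but SQLite allows only one writer at a time, so running LoadLogbookForExpedition on concurrent threads fails with a database lock, exactly as the comment says. (The builtin map() shown in the comment is itself lazy and single-threaded; the concurrent attempt it describes would have gone through an executor.) A sketch of both shapes, with load_one standing in for LoadLogbookForExpedition (hypothetical helper, not in the patch):

    from concurrent.futures import ThreadPoolExecutor

    def load_all_sequential(actuals, load_one):
        # What the patch does: one expedition at a time, safe with SQLite.
        return {ex: load_one(ex) for ex in actuals}

    def load_all_threaded(actuals, load_one):
        # Viable against a server database such as PostgreSQL or MySQL, but
        # concurrent inserts into SQLite raise "database is locked" errors.
        with ThreadPoolExecutor() as pool:
            return dict(zip(actuals, pool.map(load_one, actuals)))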