summaryrefslogtreecommitdiffstats
path: root/parsers/logbooks.py
diff options
context:
space:
mode:
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r--parsers/logbooks.py125
1 files changed, 70 insertions, 55 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index ea7c27f..5ebc671 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -109,6 +109,9 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None):
""" saves a logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
+
+    troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite
+    but we are saving the same thing too many times. Also seen in the ObjStore mimic
"""
try:
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
@@ -153,12 +156,14 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
slug = str(randint(1000,10000)) + "_" + slugify(title)[:10].replace('-','_')
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug, 'entry_type':entry_type}
+        # This creates the lbo instance of LogbookEntry
lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
for tripperson, time_underground in trippersons:
lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
+ # this creates the PersonTrip instance.
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs) # PersonTrip also saved in SetDatesFromLogbookEntries
def ParseDate(tripdate, year):
@@ -243,19 +248,25 @@ def Parselogwikitxt(year, expedition, txt):
def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq, tid=None):
+ '''Called once for each logbook entry as the logbook is parsed
+ '''
# This will need additional functions to replicate the persontrip calculation and storage. For the
# moment we leave all that to be done in the django db
global trips # should be a singleton TROG eventually
global logdataissues
+
if tid in trips:
tyear, tdate, *trest = trips[tid]
- msg = f" ! DUPLICATE on {tdate} id: '{tid}'"
+ msg = f" ! DUPLICATE tid: '{tid}' on date:{tdate} "
print(msg)
DataIssue.objects.create(parser='logbooks', message=msg)
tid = set_trip_id(str(date),seq)
#print(" - De-dup ",seq, tid)
logdataissues[tid]=msg
+
+ if not tid:
+ tid = set_trip_id(str(date),seq)
trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu, formattype)
## copy a lot of checking functionality here from EnterLogIntoDbase()
@@ -267,7 +278,9 @@ def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu,
# message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
# DataIssue.objects.create(parser='logbooks', message=message)
# logdataissues[tid+"author"]=message
- pass
+ return
+
+
# 2002, 2004, 2005, 2007, 2010 - now
# 2006 wiki text is incomplete, but the html all there. So using this parser now.
@@ -280,6 +293,7 @@ def Parseloghtmltxt(year, expedition, txt):
for trippara in tripparas:
logbook_entry_count += 1
tid = set_trip_id(year,logbook_entry_count)
+ print(f' - new tid:{tid} lbe count: {logbook_entry_count}')
s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
\s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
@@ -517,7 +531,7 @@ def SetDatesFromLogbookEntries(expedition):
lprevpersontrip.save()
persontrip.persontrip_next = None
lprevpersontrip = persontrip
- persontrip.save() # also saved in EnterLogIntoDbase. MERGE these to speed up import.
+ #persontrip.save() # also saved in EnterLogIntoDbase. MERGE these to speed up import.
def LoadLogbookForExpedition(expedition, expect):
@@ -579,51 +593,54 @@ def LoadLogbookForExpedition(expedition, expect):
print(" - Cache file does not exist \"" + str(cache_filename) +"\"")
expedition.save()
- now = time.time()
- bad_cache = True # emporarily disable reading the cache - buggy
- try:
- cache_t = os.path.getmtime(cache_filename)
- if os.path.getmtime(logbookfile) - cache_t > 2: # at least 2 secs later
- bad_cache= True
- if now - cache_t > 30*24*60*60:
- bad_cache= True
- if bad_cache:
- print(" - ! Cache is either stale or more than 30 days old. Deleting it.")
- os.remove(cache_filename)
- logentries=[]
- print(" ! Removed stale or corrupt cache file")
- raise
- print(" - Reading cache: " + str(cache_filename), end='')
+ logbook_cached = False
+ if True: # enable cache system
+ now = time.time()
+            bad_cache = False # assume the cache is valid until the staleness checks below prove otherwise
try:
- with open(cache_filename, "rb") as f:
- year,n,logentries = pickle.load(f)
- if validcache(year,n):
- print(" -- Loaded ", len(logentries), " log entries")
- logbook_cached = True
- else:
- print(" !- Told to expect ", expect, " but ", len(logentries), " found in cache")
+ cache_t = os.path.getmtime(cache_filename)
+ if os.path.getmtime(logbookfile) - cache_t > 2: # at least 2 secs later
+ print(" - ! Cache is older than the logbook file")
+ bad_cache= True
+ if now - cache_t > 30*24*60*60:
+ print(" - ! Cache is > 30 days old")
+ bad_cache= True
+ if bad_cache:
+ print(" - ! Cache is either stale or more than 30 days old. Deleting it.")
+ os.remove(cache_filename)
+ logentries=[]
+ print(" ! Removed stale or corrupt cache file")
raise
- except:
- print(" ! Failed to load corrupt cache. (Or I was told to ignore it). Deleting it.")
- os.remove(cache_filename)
- logentries=[]
- raise
- except :
- print(" - Cache de-pickle failure \"" + str(cache_filename) +"\"")
- try:
- file_in = open(logbookfile,'rb')
- txt = file_in.read().decode("latin1")
- file_in.close()
- logbook_parseable = True
- print((" - Using: " + parsefunc + " to parse " + logbookfile))
- except (IOError):
- logbook_parseable = False
- print((" ! Couldn't open logbook " + logbookfile))
+ # print(" - Reading cache: " + str(cache_filename), end='')
+ try:
+ with open(cache_filename, "rb") as f:
+ year,n,logentries = pickle.load(f)
+ if validcache(year,n):
+ print(" -- Loaded ", len(logentries), " log entries")
+ logbook_cached = True
+ else:
+ print(" !- Told to expect ", expect, " but ", len(logentries), " found in cache")
+ raise
+ except:
+ print(" ! Failed to load corrupt cache. (Or I was told to ignore it). Deleting it.")
+ os.remove(cache_filename)
+ logentries=[]
+ raise
+ except :
+ print(" - Cache de-pickle failure \"" + str(cache_filename) +"\"")
+ try:
+ file_in = open(logbookfile,'rb')
+ txt = file_in.read().decode("latin1")
+ file_in.close()
+ logbook_parseable = True
+ except (IOError):
+ logbook_parseable = False
+ print((" ! Couldn't open logbook " + logbookfile))
if logbook_parseable:
parser = globals()[parsefunc]
-
- parser(expedition.year, expedition, txt) # this launches the parser
+ print(f' - Using parser {parsefunc}')
+ parser(expedition.year, expedition, txt) # this launches the right parser for this year
SetDatesFromLogbookEntries(expedition)
if len(logentries) >0:
@@ -634,17 +651,15 @@ def LoadLogbookForExpedition(expedition, expect):
else:
print(" ! NO TRIP entries found in logbook, check the syntax.")
- if logbook_cached: # working on this bit...
- i=0
- for entrytuple in range(len(logentries)):
- date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = logentries[i]
- #print(" - entry tuple " , i, " tid", tripid1)
- EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
- entry_type, tripid1)
- EnterLogIntoObjStore(expedition.year, date, tripcave, triptitle, text, trippeople, logtime_underground,
- entry_type, tripid1, i)
- i +=1
- SetDatesFromLogbookEntries(expedition)
+ i=0
+ for entrytuple in logentries:
+ date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
+ EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
+ entry_type, tripid1)
+ EnterLogIntoObjStore(expedition.year, date, tripcave, triptitle, text, trippeople, logtime_underground,
+ entry_type, tripid1, i)
+ i +=1
+ SetDatesFromLogbookEntries(expedition)
return len(logentries)
def LoadLogbooks():
@@ -672,7 +687,7 @@ def LoadLogbooks():
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
"1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
- "1985": 22,"1984": 32,"1983": 52,"1982": 42,}
+ "1985": 24,"1984": 32,"1983": 52,"1982": 42,}
# Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
try:
os.remove("loadlogbk.log")