summaryrefslogtreecommitdiffstats
path: root/parsers/logbooks.py
diff options
context:
space:
mode:
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r--parsers/logbooks.py273
1 files changed, 100 insertions, 173 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index d615930..6a156af 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,8 +1,8 @@
import csv
-import datetime
+from datetime import datetime, date, time
import os
import re
-import time
+#import time
import pickle
import shelve
@@ -26,8 +26,6 @@ todo='''
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
-- delete all the autoLogbooKEntry stuff when we are absolutely certain what it does
-
- Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser,
or it is broken/incomplete and need hand-editing.
@@ -46,26 +44,35 @@ todo='''
- We should ensure logbook.html is utf-8 and stop this crap:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
+
+- this is a slow and uncertain function: cave = getCaveByReference(caveRef)
'''
logentries = [] # the entire logbook for one year is a single object: a list of entries
noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
'base camp', 'basecamp', 'top camp', 'topcamp' ]
-logdataissues = {}
+logdataissues = TROG['issues']['logdataissues']
trips ={}
#
# the logbook loading section
#
-def GetTripPersons(trippeople, expedition, logtime_underground, tid="!"):
+def set_trip_id(year, seq):
+ tid= f"{year}.s{seq:02d}"
+ return tid
+
+
+def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
res = [ ]
author = None
round_bracket_regex = re.compile(r"[\(\[].*?[\)\]]")
- if tid =="!":
- tid = expedition.year + "." + tripperson
+ #print(f'# {tid}')
+
for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople):
tripperson = tripperson.strip()
+ if not tid:
+ tid = expedition.year + "." + tripperson + datetime.now().strftime("%S%f") # no good. Should be getting the tid
mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson)
if mul:
tripperson = mul.group(1).strip()
@@ -86,23 +93,24 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid="!"):
author = res[-1][0]
return res, author
-def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid="!"):
+def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None):
""" saves a logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why?
"""
try:
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
except:
- message = " ! - Skipping logentry: %s - GetTripPersons FAIL in year '%s'" % (title, expedition.year)
+ message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["title"]=message
+ print(message)
return
if not author:
- print(" ! - Skipping logentry: " + title + " - no author for entry")
- message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
+ message = f" ! - {expedition.year} Skipping logentry: {title} - - no author for entry in year "
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["title"]=message
+ print(message)
return
# This needs attention. The slug field is derived from 'title'
@@ -113,16 +121,16 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
cave=None
if lplace not in noncaveplaces:
cave = GetCaveLookup().get(lplace)
- # message = " ! - '" + lplace + "' place not in noncaveplaces."
- # print(message)
- # DataIssue.objects.create(parser='logbooks', message=message)
#Check for an existing copy of the current entry, and save
expeditionday = expedition.get_expedition_day(date)
lookupAttribs={'date':date, 'title':title}
# 'cave' is converted to a string doing this, which renders as the cave slug.
# but it is a db query which we should try to avoid - rewrite this
- nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug':slugify(title)[:50], 'entry_type':entry_type}
+
+    #NEW slug for a logbook entry here! Use the unique id, not the title !!!
+ slug = tid + slugify(title)[:50]
+ nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug, 'entry_type':entry_type}
lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
@@ -133,31 +141,37 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
def ParseDate(tripdate, year):
""" Interprets dates in the expo logbooks and returns a correct datetime.date object """
- mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
- mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
- if mdatestandard:
- if not (mdatestandard.group(1) == year):
- message = " ! - Bad date (year) in logbook: " + tripdate + " - " + year
- DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues["tripdate"]=message
- return datetime.date('1970', '01', '01')
+ try:
+ mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
+ mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
+ if mdatestandard:
+ if not (mdatestandard.group(1) == year):
+ message = " ! - Bad date (year) in logbook: " + tripdate + " - " + year
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues["tripdate"]=message
+ return datetime.date('1970', '01', '01')
+ else:
+ year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
+ elif mdategoof:
+ if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]):
+ message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues["tripdate"]=message
+ return date('1970', '01', '01')
+ else:
+ yadd = int(year[:2]) * 100
+ day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
else:
- year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
- elif mdategoof:
- if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]):
- message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
+ message = " ! - Bad date in logbook: " + tripdate + " - " + year
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["tripdate"]=message
- return datetime.date('1970', '01', '01')
- else:
- yadd = int(year[:2]) * 100
- day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
- else:
- message = " ! - Bad date in logbook: " + tripdate + " - " + year
+
+ return date(year, month, day)
+ except:
+ message = " ! - Failed to parse date in logbook: " + tripdate + " - " + year
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["tripdate"]=message
-
- return datetime.date(year, month, day)
+ return date(year, month, day)
# (2006 - not any more), 2008 - 2009
def Parselogwikitxt(year, expedition, txt):
@@ -168,6 +182,8 @@ def Parselogwikitxt(year, expedition, txt):
trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
for triphead, triptext in trippara:
logbook_entry_count += 1
+ tid = set_trip_id(year,logbook_entry_count)
+
tripheadp = triphead.split("|")
if not (len(tripheadp) == 3):
message = " ! - Bad no of items in tripdate in logbook: " + tripdate + " - " + tripheadp
@@ -198,30 +214,24 @@ def Parselogwikitxt(year, expedition, txt):
logentries.append(entrytuple)
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople,
- expedition=expedition, logtime_underground=0)
+ expedition=expedition, logtime_underground=0, tid=tid)
EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople,
- tu, "wiki", tripid, logbook_entry_count)
+ tu, "wiki", tripid, logbook_entry_count, tid=tid)
-def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq):
+def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq, tid=None):
# This will need additional functions to replicate the persontrip calculation and storage. For the
# moment we leave all that to be done in the django db
global trips # should be a singleton TROG eventually
global logdataissues
- if tripid1 is None or tripid1 =="":
- tid= "n{}-s{:02d}".format(str(date),seq)
- #print(" - New id ",tid)
- else:
- tid= tripid1
-
if tid in trips:
tyear, tdate, *trest = trips[tid]
msg = f" ! DUPLICATE on {tdate} id: '{tid}'"
print(msg)
DataIssue.objects.create(parser='logbooks', message=msg)
- tid= "d{}-s{:02d}".format(str(date),seq)
+ tid = set_trip_id(str(date),seq)
#print(" - De-dup ",seq, tid)
logdataissues[tid]=msg
trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu, formattype)
@@ -247,7 +257,7 @@ def Parseloghtmltxt(year, expedition, txt):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
- tid= "n{}-s{:02d}".format(year,logbook_entry_count)
+ tid = set_trip_id(year,logbook_entry_count)
s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
\s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
@@ -285,7 +295,7 @@ def Parseloghtmltxt(year, expedition, txt):
entry_type="html", tid=tid)
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
- "html", tripid1, logbook_entry_count)
+ "html", tripid1, logbook_entry_count, tid=tid)
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand..
@@ -298,18 +308,26 @@ def Parseloghtml01(year, expedition, txt):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
- tid= f"{year}.s{logbook_entry_count:02d}"
+ tid = set_trip_id(year,logbook_entry_count)
try:
s = re.match(r"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
if not s:
- message = " ! - Skipping logentry on failure to parse header: " + tid + trippara[:300] + "..."
+ message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
break
tripheader, triptext = s.group(1), s.group(2)
mtripid = re.search(r'<a id="(.*?)"', tripheader)
+ # if not mtripid:
+    # # not an error, this is probably just a different year
+ # message = f" ! - Fail id trip:{tid} header:'{tripheader}'"
+ # DataIssue.objects.create(parser='logbooks', message=message)
+ # logdataissues[tid]=message
+ # print(message)
+
tripid = mtripid and mtripid.group(1) or ""
+ #print(f" # - mtripid: {mtripid}")
tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
tripdate, triptitle, trippeople = tripheader.split("|")
@@ -336,6 +354,13 @@ def Parseloghtml01(year, expedition, txt):
ltriptext = re.sub(r"</?u>", "_", ltriptext)
ltriptext = re.sub(r"</?i>", "''", ltriptext)
ltriptext = re.sub(r"</?b>", "'''", ltriptext)
+
+ if ltriptext == "":
+ message = " ! - Zero content for logbook entry!: " + tid
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, "html01", tripid)
@@ -343,16 +368,16 @@ def Parseloghtml01(year, expedition, txt):
try:
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0,
- entry_type="html")
-
+ entry_type="html", tid=tid)
except:
message = " ! - Enter log entry into database FAIL exception in: " + tid
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
+
try:
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
- "html01", tripid, logbook_entry_count)
+ "html01", tripid, logbook_entry_count, tid=tid)
except:
message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid
DataIssue.objects.create(parser='logbooks', message=message)
@@ -360,7 +385,7 @@ def Parseloghtml01(year, expedition, txt):
print(message)
except:
- message = " ! - Skipping logentry due to exception in: " + tid
+ message = f" ! - Skipping logentry {year} due to exception in: {tid}"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
@@ -381,11 +406,11 @@ def Parseloghtml03(year, expedition, txt):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
- tid= f"{year}.s{logbook_entry_count:02d}"
+ tid = set_trip_id(year,logbook_entry_count)
s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
if not ( s ) :
- message = " ! - Skipping logentry on failure to parse Parseloghtml03: {} {} {}...".format(tid,s,trippara[:300])
+ message = " ! - Skipping logentry {year} on failure to parse Parseloghtml03: {} {} {}...".format(tid,s,trippara[:300])
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
@@ -419,10 +444,10 @@ def Parseloghtml03(year, expedition, txt):
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
text = ltriptext, trippeople=trippeople, expedition=expedition,
- logtime_underground=0, entry_type="html")
+ logtime_underground=0, entry_type="html", tid=tid)
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
- "html03", tid, logbook_entry_count)
+ "html03", tid, logbook_entry_count, tid=tid)
def SetDatesFromLogbookEntries(expedition):
@@ -477,16 +502,15 @@ def LoadLogbookForExpedition(expedition, expect):
for di in dataissues:
ph = year
if re.search(ph, di.message) is not None:
- print(f' - CLEANING dataissue {di.message}')
+ #print(f' - CLEANING dataissue {di.message}')
di.delete()
- print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year')
+ #print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year')
dellist = []
for key, value in logdataissues.items():
- # tripentry = year + "." + str(logbook_entry_count)
- print(f' - CLEAN [{key}]')
- if key.startswith(year + "."):
- print(f' - CLEANING logdataissues [{key:12}]: value ')
+ #print(f' - CLEANING logdataissues [{key}]: {value}')
+ if key.startswith(year):
+ #print(f' - CLEANING logdataissues [{key:12}]: {value} ')
dellist.append(key)
for i in dellist:
del logdataissues[i]
@@ -547,7 +571,9 @@ def LoadLogbookForExpedition(expedition, expect):
if logbook_parseable:
parser = globals()[parsefunc]
- parser(expedition.year, expedition, txt)
+
+ parser(expedition.year, expedition, txt) # this launches the parser
+
SetDatesFromLogbookEntries(expedition)
if len(logentries) >0:
print(" - Cacheing " , len(logentries), " log entries")
@@ -625,112 +651,13 @@ def LoadLogbooks():
odb.sync()
odb.close()
-dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
-expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
-titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
-reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
-personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
-nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
-TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
-locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
-caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
-
-def parseAutoLogBookEntry(filename):
- '''An AutoLogBookEntry appears to be one that was created online using a form, for a single trip,
- which is then stored in a separate location to the usual logbook.html
- But when importing logbook.html all these individual entries also need ot be parsed.
-
- This is all redundant as we are getting rid of the whole individual trip entry system
- '''
- errors = []
- f = open(filename, "r")
- contents = f.read()
- f.close()
-
- dateMatch = dateRegex.search(contents)
- if dateMatch:
- year, month, day = [int(x) for x in dateMatch.groups()]
- date = datetime.date(year, month, day)
- else:
- errors.append(" - Date could not be found")
-
- expeditionYearMatch = expeditionYearRegex.search(contents)
- if expeditionYearMatch:
- try:
- expedition = Expedition.objects.get(year = expeditionYearMatch.groups()[0])
- personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
- except Expedition.DoesNotExist:
- errors.append(" - Expedition not in database")
- else:
- errors.append(" - Expedition Year could not be parsed")
-
- titleMatch = titleRegex.search(contents)
- if titleMatch:
- title, = titleMatch.groups()
- if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
- errors.append(" - Title too long")
- else:
- errors.append(" - Title could not be found")
-
- caveMatch = caveRegex.search(contents)
- if caveMatch:
- caveRef, = caveMatch.groups()
- try:
- # this is a slow and uncertain function:
- cave = getCaveByReference(caveRef)
- except:
- cave = None
- errors.append(" - Cave not found in database")
- else:
- cave = None
-
- locationMatch = locationRegex.search(contents)
- if locationMatch:
- location, = locationMatch.groups()
- else:
- location = None
-
- if cave is None and location is None:
- errors.append(" - Location nor cave could not be found")
+# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
+# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
+# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
+# reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
+# personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
+# nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
+# TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
+# locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
+# caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
- reportMatch = reportRegex.search(contents)
- if reportMatch:
- report, = reportMatch.groups()
- else:
- errors.append(" - Contents could not be found")
- if errors:
- return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.
- people = []
- for personMatch in personRegex.findall(contents):
- nameAuthorMatch = nameAuthorRegex.search(contents)
- if nameAuthorMatch:
- author, name = nameAuthorMatch.groups()
- if name.lower() in personExpeditionNameLookup:
- personExpo = personExpeditionNameLookup[name.lower()]
- else:
- errors.append(" - Person could not be found in database")
- author = bool(author)
- else:
- errors.append(" - Persons name could not be found")
-
- TUMatch = TURegex.search(contents)
- if TUMatch:
- TU, = TUMatch.groups()
- else:
- errors.append(" - TU could not be found")
- if not errors:
- people.append((name, author, TU))
- if errors:
- return errors # Bail out before committing to the database
- logbookEntry = LogbookEntry(date = date,
- expedition = expedition,
- title = title, cave = cave, place = location,
- text = report, slug = slugify(title)[:50],
- filename = filename)
- logbookEntry.save()
- for name, author, TU in people:
- PersonTrip(personexpedition = personExpo,
- time_underground = TU,
- logbook_entry = logbookEntry,
- is_logbook_entry_author = author).save()
- print(logbookEntry) \ No newline at end of file