Diffstat (limited to 'parsers')
-rw-r--r--  parsers/logbooks.py | 201
1 file changed, 84 insertions(+), 117 deletions(-)
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index cfc1a20..ce78e6d 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,5 +1,4 @@
# -*- coding: utf-8 -*-
-
import csv
import datetime
import os
@@ -110,7 +109,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
if not author:
- print((" - Skipping logentry: " + title + " - no author for entry"))
+ print(" * Skipping logentry: " + title + " - no author for entry")
message = "Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
DataIssue.objects.create(parser='logbooks', message=message)
return
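Note on the hunk above: the reworded message sits in a guard that records the problem as a DataIssue row and then returns, so a single entry with no recognisable author skips cleanly instead of aborting the whole bulk import. A minimal sketch of that guard-clause pattern, assuming nothing beyond what the hunk shows (enter_log_entry and record_issue are hypothetical stand-ins; the real code uses GetTripPersons and the DataIssue Django model):

    def enter_log_entry(title, author, record_issue):
        # Guard clause in the style of EnterLogIntoDbase: record the data
        # problem and skip this entry rather than raising an exception.
        if not author:
            record_issue("Skipping logentry: %s - no author for entry" % title)
            return False
        # ... create the LogbookEntry here ...
        return True

    skipped = []
    enter_log_entry("Surface survey", None, skipped.append)
    print(skipped)  # ['Skipping logentry: Surface survey - no author for entry']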
@@ -153,7 +152,6 @@ def Parselogwikitxt(year, expedition, txt):
trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
for triphead, triptext in trippara:
tripheadp = triphead.split("|")
- #print "ttt", tripheadp
assert len(tripheadp) == 3, (tripheadp, triptext)
tripdate, tripplace, trippeople = tripheadp
tripsplace = tripplace.split(" - ")
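The wiki-format parser above splits a year's logbook on ===date|place|people=== headings. A small self-contained demonstration of the same findall pattern (the sample text is invented); note the closing === terminator on the sample: because the (?====) lookahead requires another === to follow, a final entry with nothing after it would not be captured:

    import re

    sample = ("===1996-07-15|161 - Kaninchenhohle|Fred, Wilma===\n"
              "Rigged the entrance series. T/U: 5 hrs\n"
              "===1996-07-16|Journey|Barney===\n"
              "Walked up to top camp.\n"
              "===")

    # Same pattern as Parselogwikitxt: group 1 is the header between the
    # === markers, group 2 is the body up to the next === heading.
    trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", sample)
    for triphead, triptext in trippara:
        tripdate, tripplace, trippeople = triphead.split("|")
        print(tripdate, "|", tripplace, "|", trippeople)
    # 1996-07-15 | 161 - Kaninchenhohle | Fred, Wilma
    # 1996-07-16 | Journey | Barney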
@@ -161,19 +159,14 @@ def Parselogwikitxt(year, expedition, txt):
tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
if tul:
- #assert len(tul) <= 1, (triphead, triptext)
- #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
tu = tul[0][0]
else:
tu = ""
- #assert tripcave == "Journey", (triphead, triptext)
- #print tripdate
ldate = ParseDate(tripdate.strip(), year)
- #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-# 2002, 2004, 2005, 2007, 2010 - 2018
+# 2002, 2004, 2005, 2007, 2010 - now
def Parseloghtmltxt(year, expedition, txt):
#print(" - Starting log html parser")
tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
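For reference, the T/U ("time underground") regex kept in the hunk above tolerates several spellings. A standalone check of what it extracts (the sample lines are invented):

    import re

    TU_PATTERN = re.compile(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?")

    for line in ["T/U: 5 hrs", "TU 3.5 hours", "T/U unknown", "no time recorded"]:
        tul = TU_PATTERN.findall(line)
        tu = tul[0][0] if tul else ""    # same fallback as the parser
        print(repr(line), "->", repr(tu))
    # 'T/U: 5 hrs' -> '5'
    # 'TU 3.5 hours' -> '3.5'
    # 'T/U unknown' -> 'unknown'
    # 'no time recorded' -> ''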
@@ -194,27 +187,20 @@ def Parseloghtmltxt(year, expedition, txt):
if not s:
if not re.search(r"Rigging Guide", trippara):
print(("can't parse: ", trippara)) # this is 2007 which needs editing
- #assert s, trippara
continue
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
ldate = ParseDate(tripdate.strip(), year)
- #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
- #trippeople = re.sub(r"Ol(?!l)", "Olly", trippeople)
- #trippeople = re.sub(r"Wook(?!e)", "Wookey", trippeople)
triptitles = triptitle.split(" - ")
if len(triptitles) >= 2:
tripcave = triptitles[0]
else:
tripcave = "UNKNOWN"
- #print("\n", tripcave, "--- ppp", trippeople, len(triptext))
ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip()
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html")
- if logbook_entry_count == 0:
- print(" - No trip entries found in logbook, check the syntax matches htmltxt format")
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
def Parseloghtml01(year, expedition, txt):
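The three re.sub calls retained in Parseloghtmltxt above flatten the HTML paragraphs: closing </p> tags are dropped, wrapped lines are joined with single spaces, and each <p> becomes the (nonstandard, kept from the original) </br></br> separator. A quick sketch on an invented snippet:

    import re

    triptext = ("<p>Went to the 161 entrance.\n"
                "Rigged two pitches.</p>\n"
                "<p>Out by 6pm.</p>")

    ltriptext = re.sub(r"</p>", "", triptext)            # drop closing tags
    ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)     # join wrapped lines
    ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip()
    print(ltriptext)
    # </br></br>Went to the 161 entrance. Rigged two pitches. </br></br>Out by 6pm.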
@@ -227,9 +213,6 @@ def Parseloghtml01(year, expedition, txt):
tripid = mtripid and mtripid.group(1) or ""
tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
- #print " ", [tripheader]
- #continue
-
tripdate, triptitle, trippeople = tripheader.split("|")
ldate = ParseDate(tripdate.strip(), year)
@@ -247,19 +230,14 @@ def Parseloghtml01(year, expedition, txt):
mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
if mtail:
- #print mtail.group(0)
ltriptext = ltriptext[:mtail.start(0)]
ltriptext = re.sub(r"</p>", "", ltriptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
- #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
ltriptext = re.sub(r"</?u>", "_", ltriptext)
ltriptext = re.sub(r"</?i>", "''", ltriptext)
ltriptext = re.sub(r"</?b>", "'''", ltriptext)
-
- #print ldate, trippeople.strip()
- # could includ the tripid (url link for cross referencing)
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html")
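Parseloghtml01 finishes by mapping simple inline tags to wiki-style markup, as the hunk above shows: <u>...</u> becomes _..._, <i>...</i> becomes ''...'', and <b>...</b> becomes '''...'''. A self-contained sketch of those three substitutions:

    import re

    def html_inline_to_wiki(text):
        # Opening and closing tags both map to the same wiki delimiter,
        # exactly as in the hunk above.
        text = re.sub(r"</?u>", "_", text)
        text = re.sub(r"</?i>", "''", text)
        text = re.sub(r"</?b>", "'''", text)
        return text

    print(html_inline_to_wiki("a <b>very</b> <i>wet</i> <u>through trip</u>"))
    # a '''very''' ''wet'' _through trip_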
@@ -286,7 +264,6 @@ def Parseloghtml03(year, expedition, txt):
tripcave = triptitles[0]
else:
tripcave = "UNKNOWN"
- #print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
@@ -316,104 +293,94 @@ def SetDatesFromLogbookEntries(expedition):
def LoadLogbookForExpedition(expedition):
- """ Parses all logbook entries for one expedition """
-
+ """ Parses all logbook entries for one expedition
+ """
global logentries
-
- expowebbase = os.path.join(settings.EXPOWEB, "years")
- yearlinks = settings.LOGBOOK_PARSER_SETTINGS
-
logbook_parseable = False
logbook_cached = False
+ yearlinks = settings.LOGBOOK_PARSER_SETTINGS
+ expologbase = os.path.join(settings.EXPOWEB, "years")
if expedition.year in yearlinks:
- # print " - Valid logbook year: ", expedition.year
- year_settings = yearlinks[expedition.year]
+ logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0])
+ parsefunc = yearlinks[expedition.year][1]
+ else:
+ logbookfile = os.path.join(expologbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE)
+ parsefunc = settings.DEFAULT_LOGBOOK_PARSER
+ cache_filename = logbookfile + ".cache"
+
+ try:
+ bad_cache = False
+ now = time.time()
+ cache_t = os.path.getmtime(cache_filename)
+ if os.path.getmtime(logbookfile) - cache_t > 2: # at least 2 secs later
+ bad_cache= True
+ if now - cache_t > 30*24*60*60:
+ bad_cache= True
+ if bad_cache:
+ print(" - ! Cache is either stale or more than 30 days old. Deleting it.")
+ os.remove(cache_filename)
+ logentries=[]
+ print(" ! Removed stale or corrupt cache file")
+ raise
+ print(" - Reading cache: " + cache_filename, end='')
try:
- bad_cache = False
- cache_filename = os.path.join(expowebbase, year_settings[0])+".cache"
- now = time.time()
- cache_t = os.path.getmtime(cache_filename)
- file_t = os.path.getmtime(os.path.join(expowebbase, year_settings[0]))
- if file_t - cache_t > 2: # at least 2 secs later
- #print " - Cache is stale."
- bad_cache= True
- if now - cache_t > 30*24*60*60:
- #print " - Cache is more than 30 days old."
- bad_cache= True
- if bad_cache:
- print(" - Cache is either stale or more than 30 days old. Deleting it.")
- os.remove(cache_filename)
- logentries=[]
- raise
- print((" - Reading cache: " + cache_filename ))
- try:
- with open(cache_filename, "rb") as f:
- logentries = pickle.load(f)
- print(" - Loaded ", len(logentries), " objects")
- logbook_cached = True
- except:
- print(" - Failed to load corrupt cache. Deleting it.\n")
- os.remove(cache_filename)
- logentries=[]
- raise
+ with open(cache_filename, "rb") as f:
+ logentries = pickle.load(f)
+ print(" -- Loaded ", len(logentries), " log entries")
+ logbook_cached = True
except:
- print(" - Opening logbook: ")
- file_in = open(os.path.join(expowebbase, year_settings[0]),'rb')
+ print("\n ! Failed to load corrupt cache. Deleting it.\n")
+ os.remove(cache_filename)
+ logentries=[]
+ raise
+ except : # no cache found
+ #print(" - No cache \"" + cache_filename +"\"")
+ try:
+ file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
file_in.close()
- parsefunc = year_settings[1]
logbook_parseable = True
- print((" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1]))
-
- if logbook_parseable:
- parser = globals()[parsefunc]
- parser(expedition.year, expedition, txt)
- SetDatesFromLogbookEntries(expedition)
- # and this has also stored all the objects in logentries[]
- print(" - Storing " , len(logentries), " log entries")
- cache_filename = os.path.join(expowebbase, year_settings[0])+".cache"
- with open(cache_filename, "wb") as f:
- pickle.dump(logentries, f, 2)
- logentries=[] # flush for next year
-
- if logbook_cached:
- i=0
- for entrytuple in range(len(logentries)):
- date, place, title, text, trippeople, expedition, logtime_underground, \
- entry_type = logentries[i]
- #print " - - obj ", i, date, title
- EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
- entry_type)
- i +=1
- else:
- try:
- file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE),'rb')
- txt = file_in.read().decode("latin1")
- file_in.close()
- logbook_parseable = True
- print("No set parser found using default")
- parsefunc = settings.DEFAULT_LOGBOOK_PARSER
- except (IOError):
- logbook_parseable = False
- print(("Couldn't open default logbook file and nothing in settings for expo " + expedition.year))
+ print((" - Using: " + parsefunc + " to parse " + logbookfile))
+ except (IOError):
+ logbook_parseable = False
+ print((" ! Couldn't open logbook " + logbookfile))
+
+ if logbook_parseable:
+ parser = globals()[parsefunc]
+ parser(expedition.year, expedition, txt)
+ SetDatesFromLogbookEntries(expedition)
+ # and this has also stored all the log entries in logentries[]
+ if len(logentries) >0:
+ print(" - Caching ", len(logentries), " log entries")
+ with open(cache_filename, "wb") as fc:
+ pickle.dump(logentries, fc, 2)
+ else:
+ print(" ! NO TRIP entries found in logbook, check the syntax.")
+ logentries=[] # flush for next year
- #return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
+ if logbook_cached:
+ for entrytuple in logentries:
+ date, place, title, text, trippeople, expedition, logtime_underground, \
+ entry_type = entrytuple
+ EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
+ entry_type)
def LoadLogbooks():
- """ This is the master function for parsing all logbooks into the Troggle database. """
-
- # Clear the logbook data issues as we are reloading
+ """ This is the master function for parsing all logbooks into the Troggle database.
+ """
DataIssue.objects.filter(parser='logbooks').delete()
- # Fetch all expos
expos = Expedition.objects.all()
+ nologbook = ["1976", "1977","1978","1979","1980","1981","1983","1984",
+ "1985","1986","1987","1988","1989","1990",]
for expo in expos:
- print(("\nLoading Logbook for: " + expo.year))
-
- # Load logbook for expo
- LoadLogbookForExpedition(expo)
+ if expo.year not in nologbook:
+ print((" - Logbook for: " + expo.year))
+ LoadLogbookForExpedition(expo)
dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
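The rewritten cache handling above discards the pickle cache in two cases: the logbook file was modified more than 2 seconds after the cache was written, or the cache is more than 30 days old. A standalone sketch of that staleness test, assuming only what the hunk shows (the function name and max_age parameter are illustrative):

    import os
    import time

    def cache_is_bad(logbookfile, cache_filename, max_age=30 * 24 * 60 * 60):
        # Raises OSError if the cache file does not exist, which the caller
        # treats as "no cache" -- the same flow as LoadLogbookForExpedition.
        cache_t = os.path.getmtime(cache_filename)
        if os.path.getmtime(logbookfile) - cache_t > 2:
            return True    # source edited after the cache was written
        if time.time() - cache_t > max_age:
            return True    # cache more than 30 days old
        return False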
@@ -437,7 +404,7 @@ def parseAutoLogBookEntry(filename):
year, month, day = [int(x) for x in dateMatch.groups()]
date = datetime.date(year, month, day)
else:
- errors.append("Date could not be found")
+ errors.append(" - Date could not be found")
expeditionYearMatch = expeditionYearRegex.search(contents)
if expeditionYearMatch:
@@ -445,17 +412,17 @@ def parseAutoLogBookEntry(filename):
expedition = Expedition.objects.get(year = expeditionYearMatch.groups()[0])
personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
except Expedition.DoesNotExist:
- errors.append("Expedition not in database")
+ errors.append(" - Expedition not in database")
else:
- errors.append("Expedition Year could not be parsed")
+ errors.append(" - Expedition Year could not be parsed")
titleMatch = titleRegex.search(contents)
if titleMatch:
title, = titleMatch.groups()
if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
- errors.append("Title too long")
+ errors.append(" - Title too long")
else:
- errors.append("Title could not be found")
+ errors.append(" - Title could not be found")
caveMatch = caveRegex.search(contents)
if caveMatch:
@@ -464,7 +431,7 @@ def parseAutoLogBookEntry(filename):
cave = getCaveByReference(caveRef)
except AssertionError:
cave = None
- errors.append("Cave not found in database")
+ errors.append(" - Cave not found in database")
else:
cave = None
@@ -475,13 +442,13 @@ def parseAutoLogBookEntry(filename):
location = None
if cave is None and location is None:
- errors.append("Location nor cave could not be found")
+ errors.append(" - Neither a location nor a cave could be found")
reportMatch = reportRegex.search(contents)
if reportMatch:
report, = reportMatch.groups()
else:
- errors.append("Contents could not be found")
+ errors.append(" - Contents could not be found")
if errors:
return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.
people = []
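parseAutoLogBookEntry collects human-readable problems in errors rather than raising, and, as the comment above notes, returns early once a field that later steps depend on is missing. A minimal sketch of that accumulate-then-bail pattern using the dateRegex defined at the end of the logbook-loading section above (check_dates is a hypothetical helper):

    import datetime
    import re

    dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)

    def check_dates(contents):
        # Accumulate readable messages instead of raising on first failure.
        errors = []
        dateMatch = dateRegex.search(contents)
        if dateMatch:
            year, month, day = [int(x) for x in dateMatch.groups()]
            date = datetime.date(year, month, day)
        else:
            errors.append(" - Date could not be found")
            date = None
        return date, errors

    print(check_dates('<span class="date">2019-07-12</span>'))
    # (datetime.date(2019, 7, 12), [])
    print(check_dates('no date here'))
    # (None, [' - Date could not be found'])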
@@ -492,20 +459,20 @@ def parseAutoLogBookEntry(filename):
if name.lower() in personExpeditionNameLookup:
personExpo = personExpeditionNameLookup[name.lower()]
else:
- errors.append("Person could not be found in database")
+ errors.append(" - Person could not be found in database")
author = bool(author)
else:
+ errors.append(" - Person's name could not be found")
+ errors.append(" - Persons name could not be found")
TUMatch = TURegex.search(contents)
if TUMatch:
TU, = TUMatch.groups()
else:
- errors.append("TU could not be found")
+ errors.append(" - TU could not be found")
if not errors:
people.append((name, author, TU))
if errors:
- return errors # Bail out before commiting to the database
+ return errors # Bail out before committing to the database
logbookEntry = LogbookEntry(date = date,
expedition = expedition,
title = title, cave = cave, place = location,