summaryrefslogtreecommitdiffstats
path: root/parsers/logbooks.py
diff options
context:
space:
mode:
authorWookey <wookey@wookware.org>2011-07-11 23:28:23 +0100
committerWookey <wookey@wookware.org>2011-07-11 23:28:23 +0100
commitded3d58da16a609ce49fa393b70a93acd22a9d1e (patch)
tree24de35f27ab4783629bee9a8424540cecd01b728 /parsers/logbooks.py
parent3b028661f627227d7325c65adc134c3831e854d3 (diff)
parentb6a1503c7a00a582fa08cb5cfb97490f8bfa07aa (diff)
downloadtroggle-ded3d58da16a609ce49fa393b70a93acd22a9d1e.tar.gz
troggle-ded3d58da16a609ce49fa393b70a93acd22a9d1e.tar.bz2
troggle-ded3d58da16a609ce49fa393b70a93acd22a9d1e.zip
rest of martin's changes, without reverting lineend issues
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r--parsers/logbooks.py864
1 files changed, 432 insertions, 432 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index af01f46..c794f9f 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,432 +1,432 @@
-#.-*- coding: utf-8 -*-
-
-from django.conf import settings
-import core.models as models
-
-from parsers.people import GetPersonExpeditionNameLookup
-from parsers.cavetab import GetCaveLookup
-
-from django.template.defaultfilters import slugify
-
-import csv
-import re
-import datetime
-import os
-
-from utils import save_carefully
-
-#
-# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
-# it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc)
-#
-
-#
-# the logbook loading section
-#
-def GetTripPersons(trippeople, expedition, logtime_underground):
- res = [ ]
- author = None
- for tripperson in re.split(",|\+|&amp;|&(?!\w+;)| and ", trippeople):
- tripperson = tripperson.strip()
- mul = re.match("<u>(.*?)</u>$(?i)", tripperson)
- if mul:
- tripperson = mul.group(1).strip()
- if tripperson and tripperson[0] != '*':
- #assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
- personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
- if not personyear:
- print "NoMatchFor: '%s'" % tripperson
- res.append((personyear, logtime_underground))
- if mul:
- author = personyear
- if not author:
- if not res:
- return None, None
- author = res[-1][0]
- return res, author
-
-def GetTripCave(place): #need to be fuzzier about matching here. Already a very slow function...
-# print "Getting cave for " , place
- try:
- katastNumRes=[]
- katastNumRes=list(models.Cave.objects.filter(kataster_number=int(place)))
- except ValueError:
- pass
- officialNameRes=list(models.Cave.objects.filter(official_name=place))
- tripCaveRes=officialNameRes+katastNumRes
-
- if len(tripCaveRes)==1:
-# print "Place " , place , "entered as" , tripCaveRes[0]
- return tripCaveRes[0]
-
- elif models.OtherCaveName.objects.filter(name=place):
- tripCaveRes=models.OtherCaveName.objects.filter(name__icontains=place)[0].cave
-# print "Place " , place , "entered as" , tripCaveRes
- return tripCaveRes
-
- elif len(tripCaveRes)>1:
- print "Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes)
- correctIndex=input("type list index of correct cave")
- return tripCaveRes[correctIndex]
- else:
- print "No cave found for place " , place
- return
-
-
-noncaveplaces = [ "Journey", "Loser Plateau" ]
-def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground):
- """ saves a logbook entry and related persontrips """
- trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
- if not author:
- print "skipping logentry", title
- return
-
-# tripCave = GetTripCave(place)
- #
- lplace = place.lower()
- if lplace not in noncaveplaces:
- cave=GetCaveLookup().get(lplace)
-
- #Check for an existing copy of the current entry, and save
- expeditionday = expedition.get_expedition_day(date)
- lookupAttribs={'date':date, 'title':title}
- nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50]}
- lbo, created=save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)
-
- for tripperson, time_underground in trippersons:
- lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
- nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
- #print nonLookupAttribs
- save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)
-
-
-def ParseDate(tripdate, year):
- """ Interprets dates in the expo logbooks and returns a correct datetime.date object """
- mdatestandard = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
- mdategoof = re.match("(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
- if mdatestandard:
- assert mdatestandard.group(1) == year, (tripdate, year)
- year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
- elif mdategoof:
- assert not mdategoof.group(3) or mdategoof.group(3) == year[:2], mdategoof.groups()
- yadd = int(year[:2]) * 100
- day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
- else:
- assert False, tripdate
- return datetime.date(year, month, day)
-
-# 2007, 2008, 2006
-def Parselogwikitxt(year, expedition, txt):
- trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt)
- for triphead, triptext in trippara:
- tripheadp = triphead.split("|")
- #print "ttt", tripheadp
- assert len(tripheadp) == 3, (tripheadp, triptext)
- tripdate, tripplace, trippeople = tripheadp
- tripsplace = tripplace.split(" - ")
- tripcave = tripsplace[0].strip()
-
- tul = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
- if tul:
- #assert len(tul) <= 1, (triphead, triptext)
- #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
- tu = tul[0][0]
- else:
- tu = ""
- #assert tripcave == "Journey", (triphead, triptext)
-
- #print tripdate
- ldate = ParseDate(tripdate.strip(), year)
- #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
- EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
-# 2002, 2004, 2005
-def Parseloghtmltxt(year, expedition, txt):
- tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
- for trippara in tripparas:
-
- s = re.match('''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
- \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
- \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
- \s*<div\s+class="trippeople">\s*(.*?)</div>
- \s*<div\s+class="triptitle">\s*(.*?)</div>
- ([\s\S]*?)
- \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
- \s*$
- ''', trippara)
- if not s:
- if not re.search("Rigging Guide", trippara):
- print "can't parse: ", trippara # this is 2007 which needs editing
- #assert s, trippara
- continue
-
- tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
- ldate = ParseDate(tripdate.strip(), year)
- #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
- trippeople = re.sub("Ol(?!l)", "Olly", trippeople)
- trippeople = re.sub("Wook(?!e)", "Wookey", trippeople)
- triptitles = triptitle.split(" - ")
- if len(triptitles) >= 2:
- tripcave = triptitles[0]
- else:
- tripcave = "UNKNOWN"
- #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
- ltriptext = re.sub("</p>", "", triptext)
- ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
- EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
-
-# main parser for pre-2001. simpler because the data has been hacked so much to fit it
-def Parseloghtml01(year, expedition, txt):
- tripparas = re.findall("<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
- for trippara in tripparas:
- s = re.match(u"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
- assert s, trippara[:300]
- tripheader, triptext = s.group(1), s.group(2)
- mtripid = re.search('<a id="(.*?)"', tripheader)
- tripid = mtripid and mtripid.group(1) or ""
- tripheader = re.sub("</?(?:[ab]|span)[^>]*>", "", tripheader)
-
- #print " ", [tripheader]
- #continue
-
- tripdate, triptitle, trippeople = tripheader.split("|")
- ldate = ParseDate(tripdate.strip(), year)
-
- mtu = re.search('<p[^>]*>(T/?U.*)', triptext)
- if mtu:
- tu = mtu.group(1)
- triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
- else:
- tu = ""
-
- triptitles = triptitle.split(" - ")
- tripcave = triptitles[0].strip()
-
- ltriptext = triptext
-
- mtail = re.search('(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
- if mtail:
- #print mtail.group(0)
- ltriptext = ltriptext[:mtail.start(0)]
- ltriptext = re.sub("</p>", "", ltriptext)
- ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub("<p>|<br>", "\n\n", ltriptext).strip()
- #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
- ltriptext = re.sub("</?u>", "_", ltriptext)
- ltriptext = re.sub("</?i>", "''", ltriptext)
- ltriptext = re.sub("</?b>", "'''", ltriptext)
-
-
- #print ldate, trippeople.strip()
- # could includ the tripid (url link for cross referencing)
- EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
-
-def Parseloghtml03(year, expedition, txt):
- tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
- for trippara in tripparas:
- s = re.match(u"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
- assert s, trippara
- tripheader, triptext = s.group(1), s.group(2)
- tripheader = re.sub("&nbsp;", " ", tripheader)
- tripheader = re.sub("\s+", " ", tripheader).strip()
- sheader = tripheader.split(" -- ")
- tu = ""
- if re.match("T/U|Time underwater", sheader[-1]):
- tu = sheader.pop()
- if len(sheader) != 3:
- print "header not three pieces", sheader
- tripdate, triptitle, trippeople = sheader
- ldate = ParseDate(tripdate.strip(), year)
- triptitles = triptitle.split(" , ")
- if len(triptitles) >= 2:
- tripcave = triptitles[0]
- else:
- tripcave = "UNKNOWN"
- #print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
- ltriptext = re.sub("</p>", "", triptext)
- ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
- ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
- EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
-yearlinks = [
- ("2009", "2009/2009logbook.txt", Parselogwikitxt),
- ("2008", "2008/2008logbook.txt", Parselogwikitxt),
- ("2007", "2007/logbook.html", Parseloghtmltxt),
- ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt),
- ("2005", "2005/logbook.html", Parseloghtmltxt),
- ("2004", "2004/logbook.html", Parseloghtmltxt),
- ("2003", "2003/logbook.html", Parseloghtml03),
- ("2002", "2002/logbook.html", Parseloghtmltxt),
- ("2001", "2001/log.htm", Parseloghtml01),
- ("2000", "2000/log.htm", Parseloghtml01),
- ("1999", "1999/log.htm", Parseloghtml01),
- ("1998", "1998/log.htm", Parseloghtml01),
- ("1997", "1997/log.htm", Parseloghtml01),
- ("1996", "1996/log.htm", Parseloghtml01),
- ("1995", "1995/log.htm", Parseloghtml01),
- ("1994", "1994/log.htm", Parseloghtml01),
- ("1993", "1993/log.htm", Parseloghtml01),
- ("1992", "1992/log.htm", Parseloghtml01),
- ("1991", "1991/log.htm", Parseloghtml01),
- ]
-
-def SetDatesFromLogbookEntries(expedition):
- """
- Sets the date_from and date_to field for an expedition based on persontrips.
- Then sets the expedition date_from and date_to based on the personexpeditions.
- """
- for personexpedition in expedition.personexpedition_set.all():
- persontrips = personexpedition.persontrip_set.order_by('logbook_entry__date')
- # sequencing is difficult to do
- lprevpersontrip = None
- for persontrip in persontrips:
- persontrip.persontrip_prev = lprevpersontrip
- if lprevpersontrip:
- lprevpersontrip.persontrip_next = persontrip
- lprevpersontrip.save()
- persontrip.persontrip_next = None
- lprevpersontrip = persontrip
- persontrip.save()
-
-
-
-def LoadLogbookForExpedition(expedition):
- """ Parses all logbook entries for one expedition """
-
- expowebbase = os.path.join(settings.EXPOWEB, "years")
- year = str(expedition.year)
- for lyear, lloc, parsefunc in yearlinks:
- if lyear == year:
- break
- fin = open(os.path.join(expowebbase, lloc))
- print "opennning", lloc
- txt = fin.read().decode("latin1")
- fin.close()
- parsefunc(year, expedition, txt)
- SetDatesFromLogbookEntries(expedition)
- return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
-
-
-def LoadLogbooks():
- """ This is the master function for parsing all logbooks into the Troggle database. Requires yearlinks, which is a list of tuples for each expedition with expedition year, logbook path, and parsing function. """
-
- #Deletion has been moved to a seperate function to enable the non-destructive importing
- #models.LogbookEntry.objects.all().delete()
- expowebbase = os.path.join(settings.EXPOWEB, "years")
- #yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite
- #yearlinks = [ ("1996", "1996/log.htm", Parseloghtml01),] # overwrite
-
- for year, lloc, parsefunc in yearlinks:
- expedition = models.Expedition.objects.filter(year = year)[0]
- fin = open(os.path.join(expowebbase, lloc))
- txt = fin.read().decode("latin1")
- fin.close()
- parsefunc(year, expedition, txt)
- SetDatesFromLogbookEntries(expedition)
-
-dateRegex = re.compile('<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
-expeditionYearRegex = re.compile('<span\s+class="expeditionyear">(.*?)</span>', re.S)
-titleRegex = re.compile('<H1>(.*?)</H1>', re.S)
-reportRegex = re.compile('<div\s+class="report">(.*)</div>\s*</body>', re.S)
-personRegex = re.compile('<div\s+class="person">(.*?)</div>', re.S)
-nameAuthorRegex = re.compile('<span\s+class="name(,author|)">(.*?)</span>', re.S)
-TURegex = re.compile('<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
-locationRegex = re.compile('<span\s+class="location">(.*?)</span>', re.S)
-caveRegex = re.compile('<span\s+class="cave">(.*?)</span>', re.S)
-
-def parseAutoLogBookEntry(filename):
- errors = []
- f = open(filename, "r")
- contents = f.read()
- f.close()
-
- dateMatch = dateRegex.search(contents)
- if dateMatch:
- year, month, day = [int(x) for x in dateMatch.groups()]
- date = datetime.date(year, month, day)
- else:
- errors.append("Date could not be found")
-
- expeditionYearMatch = expeditionYearRegex.search(contents)
- if expeditionYearMatch:
- try:
- expedition = models.Expedition.objects.get(year = expeditionYearMatch.groups()[0])
- personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
- except models.Expedition.DoesNotExist:
- errors.append("Expedition not in database")
- else:
- errors.append("Expediton Year could not be parsed")
-
- titleMatch = titleRegex.search(contents)
- if titleMatch:
- title, = titleMatch.groups()
- if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
- errors.append("Title too long")
- else:
- errors.append("Title could not be found")
-
- caveMatch = caveRegex.search(contents)
- if caveMatch:
- caveRef, = caveMatch.groups()
- try:
- cave = models.getCaveByReference(caveRef)
- except AssertionError:
- cave = None
- errors.append("Cave not found in database")
- else:
- cave = None
-
- locationMatch = locationRegex.search(contents)
- if locationMatch:
- location, = locationMatch.groups()
- else:
- location = None
-
- if cave is None and location is None:
- errors.append("Location nor cave could not be found")
-
- reportMatch = reportRegex.search(contents)
- if reportMatch:
- report, = reportMatch.groups()
- else:
- errors.append("Contents could not be found")
- if errors:
- return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.
- people = []
- for personMatch in personRegex.findall(contents):
- nameAuthorMatch = nameAuthorRegex.search(contents)
- if nameAuthorMatch:
- author, name = nameAuthorMatch.groups()
- if name.lower() in personExpeditionNameLookup:
- personExpo = personExpeditionNameLookup[name.lower()]
- else:
- errors.append("Person could not be found in database")
- author = bool(author)
- else:
- errors.append("Persons name could not be found")
-
- TUMatch = TURegex.search(contents)
- if TUMatch:
- TU, = TUMatch.groups()
- else:
- errors.append("TU could not be found")
- if not errors:
- people.append((name, author, TU))
- if errors:
- return errors # Bail out before commiting to the database
- logbookEntry = models.LogbookEntry(date = date,
- expedition = expedition,
- title = title, cave = cave, place = location,
- text = report, slug = slugify(title)[:50],
- filename = filename)
- logbookEntry.save()
- for name, author, TU in people:
- models.PersonTrip(personexpedition = personExpo,
- time_underground = TU,
- logbook_entry = logbookEntry,
- is_logbook_entry_author = author).save()
- print logbookEntry
+#.-*- coding: utf-8 -*-
+
+from django.conf import settings
+import core.models as models
+
+from parsers.people import GetPersonExpeditionNameLookup
+from parsers.cavetab import GetCaveLookup
+
+from django.template.defaultfilters import slugify
+
+import csv
+import re
+import datetime
+import os
+
+from utils import save_carefully
+
+#
+# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
+# it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc)
+#
+
+#
+# the logbook loading section
+#
+def GetTripPersons(trippeople, expedition, logtime_underground):
+ res = [ ]
+ author = None
+ for tripperson in re.split(",|\+|&amp;|&(?!\w+;)| and ", trippeople):
+ tripperson = tripperson.strip()
+ mul = re.match("<u>(.*?)</u>$(?i)", tripperson)
+ if mul:
+ tripperson = mul.group(1).strip()
+ if tripperson and tripperson[0] != '*':
+ #assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
+ personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
+ if not personyear:
+ print "NoMatchFor: '%s'" % tripperson
+ res.append((personyear, logtime_underground))
+ if mul:
+ author = personyear
+ if not author:
+ if not res:
+ return None, None
+ author = res[-1][0]
+ return res, author
+
+def GetTripCave(place): #need to be fuzzier about matching here. Already a very slow function...
+# print "Getting cave for " , place
+ try:
+ katastNumRes=[]
+ katastNumRes=list(models.Cave.objects.filter(kataster_number=int(place)))
+ except ValueError:
+ pass
+ officialNameRes=list(models.Cave.objects.filter(official_name=place))
+ tripCaveRes=officialNameRes+katastNumRes
+
+ if len(tripCaveRes)==1:
+# print "Place " , place , "entered as" , tripCaveRes[0]
+ return tripCaveRes[0]
+
+ elif models.OtherCaveName.objects.filter(name=place):
+ tripCaveRes=models.OtherCaveName.objects.filter(name__icontains=place)[0].cave
+# print "Place " , place , "entered as" , tripCaveRes
+ return tripCaveRes
+
+ elif len(tripCaveRes)>1:
+ print "Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes)
+ correctIndex=input("type list index of correct cave")
+ return tripCaveRes[correctIndex]
+ else:
+ print "No cave found for place " , place
+ return
+
+
+noncaveplaces = [ "Journey", "Loser Plateau" ]
+def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground):
+ """ saves a logbook entry and related persontrips """
+ trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
+ if not author:
+ print "skipping logentry", title
+ return
+
+# tripCave = GetTripCave(place)
+ #
+ lplace = place.lower()
+ if lplace not in noncaveplaces:
+ cave=GetCaveLookup().get(lplace)
+
+ #Check for an existing copy of the current entry, and save
+ expeditionday = expedition.get_expedition_day(date)
+ lookupAttribs={'date':date, 'title':title}
+ nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50]}
+ lbo, created=save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)
+
+ for tripperson, time_underground in trippersons:
+ lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
+ nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
+ #print nonLookupAttribs
+ save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)
+
+
def ParseDate(tripdate, year):
    """ Interprets dates in the expo logbooks and returns a correct datetime.date object """
    # Two formats occur in the logbooks: ISO-style "yyyy-mm-dd" and the
    # sloppier "d/m/yy" or "d/m/yyyy" (single-digit month, optional
    # century, optional leading zero on the month).
    m_iso = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
    m_slash = re.match("(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
    if m_iso:
        assert m_iso.group(1) == year, (tripdate, year)
        y = int(m_iso.group(1))
        mo = int(m_iso.group(2))
        d = int(m_iso.group(3))
    elif m_slash:
        # When a century is written, it must agree with the expedition year.
        century = m_slash.group(3)
        assert not century or century == year[:2], m_slash.groups()
        y = int(m_slash.group(4)) + int(year[:2]) * 100
        mo = int(m_slash.group(2))
        d = int(m_slash.group(1))
    else:
        assert False, tripdate
    return datetime.date(y, mo, d)
+
# 2007, 2008, 2006
def Parselogwikitxt(year, expedition, txt):
    """Parse a wiki-style logbook: entries headed by ===date|place|people===."""
    for triphead, triptext in re.findall("===(.*?)===([\s\S]*?)(?====)", txt):
        header_fields = triphead.split("|")
        assert len(header_fields) == 3, (header_fields, triptext)
        tripdate, tripplace, trippeople = header_fields
        # The cave is the part of the place before any " - " qualifier.
        tripcave = tripplace.split(" - ")[0].strip()

        # T/U ("time underground") is parsed but not currently passed on.
        tu_matches = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
        tu = tu_matches[0][0] if tu_matches else ""

        ldate = ParseDate(tripdate.strip(), year)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=tripplace,
                          text=triptext, trippeople=trippeople,
                          expedition=expedition, logtime_underground=0)
+
# 2002, 2004, 2005
def Parseloghtmltxt(year, expedition, txt):
    """Parse logbooks marked up with <div class="tripdate"/"trippeople"/
    "triptitle"/"timeug"> blocks (2002, 2004, 2005; also run over the
    partially converted 2007 file, whose unparseable entries are skipped).
    """
    # Trip reports are separated by horizontal rules.
    tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:

        # Verbose regex: optional duplicate date div, optional anchor,
        # then the date / people / title divs, the report body, and an
        # optional trailing "timeug" (time underground) div.
        s = re.match('''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
                        \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
                        \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
                        \s*<div\s+class="trippeople">\s*(.*?)</div>
                        \s*<div\s+class="triptitle">\s*(.*?)</div>
                        ([\s\S]*?)
                        \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
                        \s*$
                     ''', trippara)
        if not s:
            if not re.search("Rigging Guide", trippara):
                print "can't parse: ", trippara # this is 2007 which needs editing
            #assert s, trippara
            continue

        tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
        ldate = ParseDate(tripdate.strip(), year)
        #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
        # Expand abbreviated names used in the logbooks.
        trippeople = re.sub("Ol(?!l)", "Olly", trippeople)
        trippeople = re.sub("Wook(?!e)", "Wookey", trippeople)
        # Title format is "cave - rest"; fall back to UNKNOWN.
        triptitles = triptitle.split(" - ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"
        #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
        # Normalise the HTML body to plain-text paragraphs.
        ltriptext = re.sub("</p>", "", triptext)
        ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
        # NOTE(review): tu is captured but not passed on; logtime_underground
        # is hard-coded to 0 here.
        EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
+
+
# main parser for pre-2001. simpler because the data has been hacked so much to fit it
def Parseloghtml01(year, expedition, txt):
    """Parse the 1991-2001 logbooks: <hr>-separated entries whose first
    paragraph is a "date|title|people" header, followed by the body."""
    tripparas = re.findall("<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        # First <p> is the header, the remainder is the trip text.
        # NOTE(review): the trailing "(?i)" inline flag is legal on the
        # Python 2 this targets but deprecated/invalid in modern Python.
        s = re.match(u"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
        assert s, trippara[:300]
        tripheader, triptext = s.group(1), s.group(2)
        mtripid = re.search('<a id="(.*?)"', tripheader)
        tripid = mtripid and mtripid.group(1) or ""
        # Strip anchor/bold/span markup before splitting the header.
        tripheader = re.sub("</?(?:[ab]|span)[^>]*>", "", tripheader)

        #print "  ", [tripheader]
        #continue

        tripdate, triptitle, trippeople = tripheader.split("|")
        ldate = ParseDate(tripdate.strip(), year)

        # Pull a trailing "T/U ..." (time underground) paragraph out of the body.
        mtu = re.search('<p[^>]*>(T/?U.*)', triptext)
        if mtu:
            tu = mtu.group(1)
            triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
        else:
            tu = ""

        triptitles = triptitle.split(" - ")
        tripcave = triptitles[0].strip()

        ltriptext = triptext

        # Trim trailing cross-reference links and whitespace/junk.
        mtail = re.search('(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
        if mtail:
            #print mtail.group(0)
            ltriptext = ltriptext[:mtail.start(0)]
        # Convert the remaining HTML markup to wiki-style plain text.
        ltriptext = re.sub("</p>", "", ltriptext)
        ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub("<p>|<br>", "\n\n", ltriptext).strip()
        #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
        ltriptext = re.sub("</?u>", "_", ltriptext)
        ltriptext = re.sub("</?i>", "''", ltriptext)
        ltriptext = re.sub("</?b>", "'''", ltriptext)


        #print ldate, trippeople.strip()
        # could includ the tripid (url link for cross referencing)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
+
+
+def Parseloghtml03(year, expedition, txt):
+ tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
+ for trippara in tripparas:
+ s = re.match(u"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
+ assert s, trippara
+ tripheader, triptext = s.group(1), s.group(2)
+ tripheader = re.sub("&nbsp;", " ", tripheader)
+ tripheader = re.sub("\s+", " ", tripheader).strip()
+ sheader = tripheader.split(" -- ")
+ tu = ""
+ if re.match("T/U|Time underwater", sheader[-1]):
+ tu = sheader.pop()
+ if len(sheader) != 3:
+ print "header not three pieces", sheader
+ tripdate, triptitle, trippeople = sheader
+ ldate = ParseDate(tripdate.strip(), year)
+ triptitles = triptitle.split(" , ")
+ if len(triptitles) >= 2:
+ tripcave = triptitles[0]
+ else:
+ tripcave = "UNKNOWN"
+ #print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
+ ltriptext = re.sub("</p>", "", triptext)
+ ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
+ ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
+ ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
+ EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
+
# Lookup table: (expedition year, logbook path relative to
# settings.EXPOWEB/"years", parser function for that year's format).
# Consumed by LoadLogbookForExpedition and LoadLogbooks.
yearlinks = [
    ("2009", "2009/2009logbook.txt", Parselogwikitxt),
    ("2008", "2008/2008logbook.txt", Parselogwikitxt),
    ("2007", "2007/logbook.html", Parseloghtmltxt),
    ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt),
    ("2005", "2005/logbook.html", Parseloghtmltxt),
    ("2004", "2004/logbook.html", Parseloghtmltxt),
    ("2003", "2003/logbook.html", Parseloghtml03),
    ("2002", "2002/logbook.html", Parseloghtmltxt),
    ("2001", "2001/log.htm", Parseloghtml01),
    ("2000", "2000/log.htm", Parseloghtml01),
    ("1999", "1999/log.htm", Parseloghtml01),
    ("1998", "1998/log.htm", Parseloghtml01),
    ("1997", "1997/log.htm", Parseloghtml01),
    ("1996", "1996/log.htm", Parseloghtml01),
    ("1995", "1995/log.htm", Parseloghtml01),
    ("1994", "1994/log.htm", Parseloghtml01),
    ("1993", "1993/log.htm", Parseloghtml01),
    ("1992", "1992/log.htm", Parseloghtml01),
    ("1991", "1991/log.htm", Parseloghtml01),
    ]
+
def SetDatesFromLogbookEntries(expedition):
    """
    Links each persontrip of the expedition to the previous and next
    persontrip of the same personexpedition, in logbook-entry date order.

    NOTE(review): despite the name, this function does not set any
    date_from/date_to fields -- the body only maintains the
    persontrip_prev/persontrip_next chain.  The previous docstring
    (claiming it set expedition dates) described code that is not here.
    """
    for personexpedition in expedition.personexpedition_set.all():
        persontrips = personexpedition.persontrip_set.order_by('logbook_entry__date')
        # sequencing is difficult to do
        lprevpersontrip = None
        for persontrip in persontrips:
            persontrip.persontrip_prev = lprevpersontrip
            if lprevpersontrip:
                # Back-patch the previous trip's "next" pointer now that
                # we know its successor.
                lprevpersontrip.persontrip_next = persontrip
                lprevpersontrip.save()
            # Provisionally the last trip; overwritten on the next iteration.
            persontrip.persontrip_next = None
            lprevpersontrip = persontrip
            persontrip.save()
+
+
+
+def LoadLogbookForExpedition(expedition):
+ """ Parses all logbook entries for one expedition """
+
+ expowebbase = os.path.join(settings.EXPOWEB, "years")
+ year = str(expedition.year)
+ for lyear, lloc, parsefunc in yearlinks:
+ if lyear == year:
+ break
+ fin = open(os.path.join(expowebbase, lloc))
+ print "opennning", lloc
+ txt = fin.read().decode("latin1")
+ fin.close()
+ parsefunc(year, expedition, txt)
+ SetDatesFromLogbookEntries(expedition)
+ return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
+
+
def LoadLogbooks():
    """ This is the master function for parsing all logbooks into the Troggle database. Requires yearlinks, which is a list of tuples for each expedition with expedition year, logbook path, and parsing function. """

    # Deletion has been moved to a separate function to enable the
    # non-destructive importing
    #models.LogbookEntry.objects.all().delete()
    expowebbase = os.path.join(settings.EXPOWEB, "years")

    for year, lloc, parsefunc in yearlinks:
        # First matching Expedition row for the year.
        expedition = models.Expedition.objects.filter(year=year)[0]
        logbookfile = open(os.path.join(expowebbase, lloc))
        contents = logbookfile.read().decode("latin1")
        logbookfile.close()
        parsefunc(year, expedition, contents)
        SetDatesFromLogbookEntries(expedition)
+
# Patterns for the structured "auto" logbook entry files consumed by
# parseAutoLogBookEntry below; re.S lets ".*" span newlines.
dateRegex = re.compile('<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
expeditionYearRegex = re.compile('<span\s+class="expeditionyear">(.*?)</span>', re.S)
titleRegex = re.compile('<H1>(.*?)</H1>', re.S)
reportRegex = re.compile('<div\s+class="report">(.*)</div>\s*</body>', re.S)
personRegex = re.compile('<div\s+class="person">(.*?)</div>', re.S)
# Group 1 is ",author" when the span marks the entry's author, else "".
nameAuthorRegex = re.compile('<span\s+class="name(,author|)">(.*?)</span>', re.S)
TURegex = re.compile('<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
locationRegex = re.compile('<span\s+class="location">(.*?)</span>', re.S)
caveRegex = re.compile('<span\s+class="cave">(.*?)</span>', re.S)
+
+def parseAutoLogBookEntry(filename):
+ errors = []
+ f = open(filename, "r")
+ contents = f.read()
+ f.close()
+
+ dateMatch = dateRegex.search(contents)
+ if dateMatch:
+ year, month, day = [int(x) for x in dateMatch.groups()]
+ date = datetime.date(year, month, day)
+ else:
+ errors.append("Date could not be found")
+
+ expeditionYearMatch = expeditionYearRegex.search(contents)
+ if expeditionYearMatch:
+ try:
+ expedition = models.Expedition.objects.get(year = expeditionYearMatch.groups()[0])
+ personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
+ except models.Expedition.DoesNotExist:
+ errors.append("Expedition not in database")
+ else:
+ errors.append("Expediton Year could not be parsed")
+
+ titleMatch = titleRegex.search(contents)
+ if titleMatch:
+ title, = titleMatch.groups()
+ if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
+ errors.append("Title too long")
+ else:
+ errors.append("Title could not be found")
+
+ caveMatch = caveRegex.search(contents)
+ if caveMatch:
+ caveRef, = caveMatch.groups()
+ try:
+ cave = models.getCaveByReference(caveRef)
+ except AssertionError:
+ cave = None
+ errors.append("Cave not found in database")
+ else:
+ cave = None
+
+ locationMatch = locationRegex.search(contents)
+ if locationMatch:
+ location, = locationMatch.groups()
+ else:
+ location = None
+
+ if cave is None and location is None:
+ errors.append("Location nor cave could not be found")
+
+ reportMatch = reportRegex.search(contents)
+ if reportMatch:
+ report, = reportMatch.groups()
+ else:
+ errors.append("Contents could not be found")
+ if errors:
+ return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.
+ people = []
+ for personMatch in personRegex.findall(contents):
+ nameAuthorMatch = nameAuthorRegex.search(contents)
+ if nameAuthorMatch:
+ author, name = nameAuthorMatch.groups()
+ if name.lower() in personExpeditionNameLookup:
+ personExpo = personExpeditionNameLookup[name.lower()]
+ else:
+ errors.append("Person could not be found in database")
+ author = bool(author)
+ else:
+ errors.append("Persons name could not be found")
+
+ TUMatch = TURegex.search(contents)
+ if TUMatch:
+ TU, = TUMatch.groups()
+ else:
+ errors.append("TU could not be found")
+ if not errors:
+ people.append((name, author, TU))
+ if errors:
+ return errors # Bail out before commiting to the database
+ logbookEntry = models.LogbookEntry(date = date,
+ expedition = expedition,
+ title = title, cave = cave, place = location,
+ text = report, slug = slugify(title)[:50],
+ filename = filename)
+ logbookEntry.save()
+ for name, author, TU in people:
+ models.PersonTrip(personexpedition = personExpo,
+ time_underground = TU,
+ logbook_entry = logbookEntry,
+ is_logbook_entry_author = author).save()
+ print logbookEntry