diff options
author | Wookey <wookey@wookware.org> | 2011-07-11 23:28:23 +0100 |
---|---|---|
committer | Wookey <wookey@wookware.org> | 2011-07-11 23:28:23 +0100 |
commit | ded3d58da16a609ce49fa393b70a93acd22a9d1e (patch) | |
tree | 24de35f27ab4783629bee9a8424540cecd01b728 /parsers/logbooks.py | |
parent | 3b028661f627227d7325c65adc134c3831e854d3 (diff) | |
parent | b6a1503c7a00a582fa08cb5cfb97490f8bfa07aa (diff) | |
download | troggle-ded3d58da16a609ce49fa393b70a93acd22a9d1e.tar.gz troggle-ded3d58da16a609ce49fa393b70a93acd22a9d1e.tar.bz2 troggle-ded3d58da16a609ce49fa393b70a93acd22a9d1e.zip |
rest of martin's changes, without reverting lineend issues
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- | parsers/logbooks.py | 864 |
1 files changed, 432 insertions, 432 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py index af01f46..c794f9f 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -1,432 +1,432 @@ -#.-*- coding: utf-8 -*-
-
-from django.conf import settings
-import core.models as models
-
-from parsers.people import GetPersonExpeditionNameLookup
-from parsers.cavetab import GetCaveLookup
-
-from django.template.defaultfilters import slugify
-
-import csv
-import re
-import datetime
-import os
-
-from utils import save_carefully
-
-#
-# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
-# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc)
-#
-
-#
-# the logbook loading section
-#
def GetTripPersons(trippeople, expedition, logtime_underground):
    """ Resolves the people named in a trip header for one expedition.

    trippeople is split on ",", "+", "&amp;", a bare "&" that does not start
    an HTML entity, and " and ".  A name wrapped in <u>...</u> marks the
    author of the entry.  Empty fragments and names starting with "*" are
    ignored.

    Returns (res, author):
      res    -- list of (personexpedition, logtime_underground) tuples in
                header order; a name missing from the lookup is reported on
                stdout and still appended with None as the person.
      author -- the underlined person, or the last person listed when nobody
                usable is underlined.
    Returns (None, None) when no usable name was found at all.
    """
    res = []
    author = None
    namelookup = None   # built on first use, then reused for every name
    # "&amp;" has to be tried before the bare "&" alternative, and the bare
    # "&" must not split entities such as "&eacute;" inside a name.
    for fragment in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
        tripperson = fragment.strip()
        mul = re.match(r"<u>(.*?)</u>$", tripperson, re.I)
        if mul:
            tripperson = mul.group(1).strip()
        if tripperson and tripperson[0] != '*':
            if namelookup is None:
                namelookup = GetPersonExpeditionNameLookup(expedition)
            personyear = namelookup.get(tripperson.lower())
            if not personyear:
                print("NoMatchFor: '%s'" % tripperson)
            res.append((personyear, logtime_underground))
            if mul:
                author = personyear
    if not author:
        if not res:
            return None, None
        # nobody (resolvable) was underlined: fall back to the last person
        author = res[-1][0]
    return res, author
-
def GetTripCave(place):
    """ Finds the cave that a logbook "place" string refers to.

    Lookup order:
      1. caves whose official_name equals place, plus caves whose
         kataster_number equals int(place) when place is numeric;
         if exactly one cave results it is returned;
      2. an OtherCaveName whose name equals place;
      3. if step 1 found several caves, the user is asked on stdin for the
         list index of the right one.
    Returns the cave, or None when nothing matches or the typed index is
    not usable.
    """
    # need to be fuzzier about matching here. Already a very slow function...
    import sys

    try:
        katastNumRes = list(models.Cave.objects.filter(kataster_number=int(place)))
    except (TypeError, ValueError):
        # place is not a number (or is None): no kataster lookup possible
        katastNumRes = []
    officialNameRes = list(models.Cave.objects.filter(official_name=place))
    tripCaveRes = officialNameRes + katastNumRes

    if len(tripCaveRes) == 1:
        return tripCaveRes[0]

    # Return the same exact-name row that the existence check found, not the
    # first substring hit, which could belong to a different cave.
    otherNameRes = models.OtherCaveName.objects.filter(name=place)
    if otherNameRes:
        return otherNameRes[0].cave

    if len(tripCaveRes) > 1:
        print("Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes))
        # Read the answer as plain text; it is never evaluated as code.
        sys.stdout.write("type list index of correct cave")
        sys.stdout.flush()
        answer = sys.stdin.readline().strip()
        try:
            return tripCaveRes[int(answer)]
        except (ValueError, IndexError):
            print("Not a valid list index: '%s'" % answer)
            return None

    print("No cave found for place  %s" % (place,))
    return None
-
-
noncaveplaces = [ "Journey", "Loser Plateau" ]
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground):
    """ Saves a logbook entry and its related persontrips.

    The people in trippeople are resolved with GetTripPersons; when no
    author can be determined the entry is reported on stdout and skipped.
    Unless place is one of noncaveplaces (compared case-insensitively), the
    cave is looked up by the lower-cased place name; otherwise, or when the
    lookup has no entry, the logbook entry is stored with cave None.
    The LogbookEntry is keyed on (date, title) and one PersonTrip is stored
    per resolved person, both through save_carefully.
    """
    trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
    if not author:
        print("skipping logentry %s" % (title,))
        return

    # noncaveplaces is written in mixed case, so both sides are lower-cased;
    # cave defaults to None so that non-cave places can be stored at all.
    lplace = place.lower()
    cave = None
    if lplace not in [noncave.lower() for noncave in noncaveplaces]:
        cave = GetCaveLookup().get(lplace)

    # The returned day is not used here.  NOTE(review): the call is kept
    # because it presumably creates the day record if missing -- confirm.
    expedition.get_expedition_day(date)

    # Check for an existing copy of the current entry, and save
    lookupAttribs = {'date': date, 'title': title}
    nonLookupAttribs = {'place': place, 'text': text, 'expedition': expedition,
                        'cave': cave, 'slug': slugify(title)[:50]}
    lbo, created = save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)

    for tripperson, time_underground in trippersons:
        lookupAttribs = {'personexpedition': tripperson, 'logbook_entry': lbo}
        nonLookupAttribs = {'time_underground': time_underground,
                            'is_logbook_entry_author': (tripperson == author)}
        save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)
-
-
def ParseDate(tripdate, year):
    """ Interprets dates in the expo logbooks and returns a datetime.date object.

    tripdate -- date text in one of two layouts:
                  "YYYY-MM-DD"                (the year must equal `year`)
                  "D/M/YY" or "D/M/YYYY"      (day and month may have one or
                                               two digits; a written century
                                               must equal year[:2])
    year     -- the expedition year as a four-character string; it supplies
                the century for two-digit years.

    Raises AssertionError when tripdate matches neither layout or disagrees
    with `year`.  datetime.date raises ValueError for impossible dates.
    """
    mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
    if mdatestandard:
        # Raised explicitly (not via `assert`) so the check survives python -O.
        if mdatestandard.group(1) != year:
            raise AssertionError((tripdate, year))
        lyear, month, day = [int(piece) for piece in mdatestandard.groups()]
        return datetime.date(lyear, month, day)

    # The month may have two digits, so October to December parse as well.
    mdategoof = re.match(r"(\d\d?)/(\d\d?)/(20|19)?(\d\d)", tripdate)
    if mdategoof:
        day, month, century, shortyear = mdategoof.groups()
        if century and century != year[:2]:
            raise AssertionError(mdategoof.groups())
        yadd = int(year[:2]) * 100
        return datetime.date(int(shortyear) + yadd, int(month), int(day))

    raise AssertionError(tripdate)
-
# wiki-text logbooks (yearlinks uses this parser for 2009, 2008 and 2006)
def Parselogwikitxt(year, expedition, txt):
    """ Parses a wiki-style logbook and stores every entry.

    Each entry starts with a heading "===date|place|people===" and runs up
    to the next "===" or, for the final entry, to the end of txt.  The cave
    name is the part of `place` before the first " - ".  Time underground
    is not extracted: every entry is stored with logtime_underground=0.

    Raises AssertionError when a heading does not have exactly three
    "|"-separated parts, or when ParseDate rejects the date.
    """
    # "|\Z" lets the last entry of the file end at end-of-text instead of
    # being silently dropped for lack of a following "===".
    for triphead, triptext in re.findall(r"===(.*?)===([\s\S]*?)(?====|\Z)", txt):
        tripheadp = triphead.split("|")
        if len(tripheadp) != 3:
            raise AssertionError((tripheadp, triptext))
        tripdate, tripplace, trippeople = tripheadp
        tripcave = tripplace.split(" - ")[0].strip()

        ldate = ParseDate(tripdate.strip(), year)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=tripplace, text=triptext,
                          trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
# HTML logbooks marked up with tripdate / trippeople / triptitle divs
def Parseloghtmltxt(year, expedition, txt):
    """ Parses a div-structured HTML logbook and stores every entry.

    txt is cut into chunks at <hr /> tags.  A chunk that does not fit the
    expected div layout is skipped; it is also printed, unless it contains
    "Rigging Guide".  The cave is the part of the title before the first
    " - ", or "UNKNOWN" when the title has no such separator.  Paragraph
    markup in the text is turned into blank-line separated plain text.
    Entries are stored with logtime_underground=0.
    """
    entrypattern = re.compile(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
        \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
        \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
        \s*<div\s+class="trippeople">\s*(.*?)</div>
        \s*<div\s+class="triptitle">\s*(.*?)</div>
        ([\s\S]*?)
        \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
        \s*$
        ''')
    # applied in this order to turn the entry markup into plain text
    textcleanups = (
        (r"</p>", ""),
        (r"\s*?\n\s*", " "),
        (r"<p>", "\n\n"),
    )

    for chunk in re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt):
        found = entrypattern.match(chunk)
        if found is None:
            if re.search("Rigging Guide", chunk) is None:
                print("can't parse:  %s" % (chunk,))  # this is 2007 which needs editing
            continue

        # groups 1, 2 (anchor ids) and 7 (time underground) are not used
        tripdate, trippeople, triptitle, triptext = found.group(3, 4, 5, 6)
        ldate = ParseDate(tripdate.strip(), year)

        # expand the two abbreviated names that occur in these logbooks
        trippeople = re.sub("Ol(?!l)", "Olly", trippeople)
        trippeople = re.sub("Wook(?!e)", "Wookey", trippeople)

        titleparts = triptitle.split(" - ")
        tripcave = titleparts[0] if len(titleparts) >= 2 else "UNKNOWN"

        ltriptext = triptext
        for pattern, replacement in textcleanups:
            ltriptext = re.sub(pattern, replacement, ltriptext)
        ltriptext = ltriptext.strip()

        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
-
# main parser for pre-2001. simpler because the data has been hacked so much to fit it
def Parseloghtml01(year, expedition, txt):
    """ Parses an old-style HTML logbook and stores every entry.

    txt is cut into chunks at <hr> tags.  The first paragraph of a chunk is
    the header "date|title|people" (a, b and span tags are removed from it);
    the rest is the trip text.  The cave is the part of the title before the
    first " - ".  A "<p>T/U..." line is cut out of the text, and trailing
    links, separators and "(same day)" / "(n)" markers are stripped.
    <u>, <i> and <b> tags become "_", "''" and "'''".  Entries are stored
    with logtime_underground=0.

    Raises AssertionError when a chunk has no paragraph markup, and
    ValueError when the header does not have exactly three "|"-separated
    parts.
    """
    for trippara in re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt):
        # flags are passed as arguments: inline global flags are only valid
        # at the very start of a pattern
        s = re.match(r"\s*(?:<p>)?(.*?)</?p>(.*)$", trippara, re.S | re.I)
        if not s:
            raise AssertionError(trippara[:300])
        tripheader, triptext = s.group(1), s.group(2)
        tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)

        tripdate, triptitle, trippeople = tripheader.split("|")
        ldate = ParseDate(tripdate.strip(), year)

        # drop the time-underground line from the text; its value is not stored
        mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
        if mtu:
            triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]

        tripcave = triptitle.split(" - ")[0].strip()

        # strip trailing cross-reference links, separators and day markers;
        # "&" is accepted both bare and written as the entity "&amp;"
        ltriptext = triptext
        mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&(?:amp;)?|</?p>|\((?:same day|\d+)\))*$', ltriptext)
        if mtail:
            ltriptext = ltriptext[:mtail.start(0)]
        ltriptext = re.sub(r"</p>", "", ltriptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
        ltriptext = re.sub(r"</?u>", "_", ltriptext)
        ltriptext = re.sub(r"</?i>", "''", ltriptext)
        ltriptext = re.sub(r"</?b>", "'''", ltriptext)

        # could include the trip id (url link for cross referencing)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
-
def Parseloghtml03(year, expedition, txt):
    """ Parses the 2003-style HTML logbook and stores every entry.

    txt is cut into chunks at <hr /> tags.  The first <p>...</p> of a chunk
    is the header "date -- title -- people", optionally followed by a
    " -- T/U..." piece, which is discarded.  A header that does not have
    exactly three pieces is reported on stdout and the entry is skipped.
    The cave is the part of the title before the first " , ", or "UNKNOWN"
    when there is no such separator.  Characters outside a small ASCII
    whitelist are replaced by "_NONASCII_" in the stored text.  Entries are
    stored with logtime_underground=0.

    Raises AssertionError when a chunk does not start with a paragraph.
    """
    for trippara in re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt):
        s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
        if not s:
            raise AssertionError(trippara)
        tripheader, triptext = s.group(1), s.group(2)
        # non-breaking spaces (entity or character) count as ordinary spaces
        tripheader = re.sub(u"&nbsp;|\u00a0", " ", tripheader)
        tripheader = re.sub(r"\s+", " ", tripheader).strip()
        sheader = tripheader.split(" -- ")
        if re.match("T/U|Time underwater", sheader[-1]):
            sheader.pop()   # time underground is not stored
        if len(sheader) != 3:
            # skip the entry, as Parseloghtmltxt does for chunks it cannot parse
            print("header not three pieces %s" % (sheader,))
            continue
        tripdate, triptitle, trippeople = sheader
        ldate = ParseDate(tripdate.strip(), year)
        triptitles = triptitle.split(" , ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"
        ltriptext = re.sub(r"</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
        ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
# One (expedition year, logbook path relative to the "years" directory,
# parser function) tuple per expedition, newest first.
yearlinks = [
    ("2009", "2009/2009logbook.txt", Parselogwikitxt),
    ("2008", "2008/2008logbook.txt", Parselogwikitxt),
    ("2007", "2007/logbook.html", Parseloghtmltxt),
    ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt),
    ("2005", "2005/logbook.html", Parseloghtmltxt),
    ("2004", "2004/logbook.html", Parseloghtmltxt),
    ("2003", "2003/logbook.html", Parseloghtml03),
    ("2002", "2002/logbook.html", Parseloghtmltxt),
]
# 2001 back to 1991 all follow the same "<year>/log.htm" layout and parser.
yearlinks.extend(
    (str(oldyear), "%d/log.htm" % oldyear, Parseloghtml01)
    for oldyear in range(2001, 1990, -1)
)
-
def SetDatesFromLogbookEntries(expedition):
    """
    Links each person's trips on an expedition into a date-ordered chain.

    For every personexpedition of the expedition, its persontrips are
    ordered by logbook entry date and each one gets persontrip_prev and
    persontrip_next pointing at its neighbours (None at either end), then
    is saved once.  Despite the name, no date_from / date_to field is
    written by this function.
    """
    for personexpedition in expedition.personexpedition_set.all():
        # materialise the ordered trips so neighbours can be addressed by index
        persontrips = list(personexpedition.persontrip_set.order_by('logbook_entry__date'))
        lastindex = len(persontrips) - 1
        for index, persontrip in enumerate(persontrips):
            persontrip.persontrip_prev = persontrips[index - 1] if index > 0 else None
            persontrip.persontrip_next = persontrips[index + 1] if index < lastindex else None
            persontrip.save()
-
-
-
def LoadLogbookForExpedition(expedition):
    """ Parses all logbook entries for one expedition.

    The logbook file and parser are taken from the yearlinks entry whose
    year equals str(expedition.year); the file is read as latin1 from
    below settings.EXPOWEB/years.  Afterwards the person trips are chained
    with SetDatesFromLogbookEntries.

    Returns a "TOLOAD: ..." summary string, or a "No logbook ..." message
    (and parses nothing) when yearlinks has no entry for the year.
    """
    expowebbase = os.path.join(settings.EXPOWEB, "years")
    year = str(expedition.year)
    for lyear, lloc, parsefunc in yearlinks:
        if lyear == year:
            break
    else:
        # without this the loop variables would be left on the last
        # yearlinks entry and another year's logbook would be parsed
        return "No logbook listed in yearlinks for " + year

    fin = open(os.path.join(expowebbase, lloc))
    print("opennning %s" % (lloc,))
    try:
        txt = fin.read().decode("latin1")
    finally:
        fin.close()
    parsefunc(year, expedition, txt)
    SetDatesFromLogbookEntries(expedition)
    # NOTE(review): the summary counts logbook entries of the *second*
    # personexpedition only and raises IndexError with fewer than two people
    # -- looks like debugging output; confirm what callers expect.
    return "TOLOAD: " + year + "  " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + "  " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
-
-
def LoadLogbooks():
    """ This is the master function for parsing all logbooks into the Troggle database.

    Requires yearlinks, which is a list of tuples for each expedition with
    expedition year, logbook path, and parsing function.  For every entry
    the expedition with that year is fetched, the logbook file is read as
    latin1 from below settings.EXPOWEB/years and handed to the parser, and
    the person trips are then chained with SetDatesFromLogbookEntries.

    Raises IndexError when no Expedition exists for a year in yearlinks.
    """
    # Deletion has been moved to a separate function to enable non-destructive importing
    expowebbase = os.path.join(settings.EXPOWEB, "years")

    for year, lloc, parsefunc in yearlinks:
        expedition = models.Expedition.objects.filter(year = year)[0]
        fin = open(os.path.join(expowebbase, lloc))
        try:
            txt = fin.read().decode("latin1")
        finally:
            # close the file even when reading or decoding fails
            fin.close()
        parsefunc(year, expedition, txt)
        SetDatesFromLogbookEntries(expedition)
-
# Patterns for the fields of an automatically generated logbook entry page.
# All are compiled with re.S so "." also matches newlines inside a field.
dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
# greedy on purpose: the report runs up to the last </div> before </body>
reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
# group 1 is ",author" for the entry's author and "" for everybody else
nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
-
def parseAutoLogBookEntry(filename):
    """ Imports one automatically generated logbook entry page.

    The file is searched for the date, expedition year, title, cave,
    location, report and person fields using the module-level regexes.
    Every problem found is collected as a message.  Nothing is written to
    the database unless all fields and all people were resolved.

    Returns the list of error messages when the entry could not be
    imported, and None after the LogbookEntry and one PersonTrip per person
    have been saved.
    """
    errors = []
    f = open(filename, "r")
    try:
        contents = f.read()
    finally:
        f.close()

    dateMatch = dateRegex.search(contents)
    if dateMatch:
        year, month, day = [int(x) for x in dateMatch.groups()]
        try:
            date = datetime.date(year, month, day)
        except ValueError:
            # right shape, but not a real calendar date (e.g. month 13)
            errors.append("Date is not a valid date")
    else:
        errors.append("Date could not be found")

    expeditionYearMatch = expeditionYearRegex.search(contents)
    if expeditionYearMatch:
        try:
            expedition = models.Expedition.objects.get(year = expeditionYearMatch.groups()[0])
            personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
        except models.Expedition.DoesNotExist:
            errors.append("Expedition not in database")
    else:
        errors.append("Expediton Year could not be parsed")

    titleMatch = titleRegex.search(contents)
    if titleMatch:
        title, = titleMatch.groups()
        if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
            errors.append("Title too long")
    else:
        errors.append("Title could not be found")

    cave = None
    caveMatch = caveRegex.search(contents)
    if caveMatch:
        caveRef, = caveMatch.groups()
        try:
            cave = models.getCaveByReference(caveRef)
        except AssertionError:
            errors.append("Cave not found in database")

    location = None
    locationMatch = locationRegex.search(contents)
    if locationMatch:
        location, = locationMatch.groups()

    if cave is None and location is None:
        errors.append("Location nor cave could not be found")

    reportMatch = reportRegex.search(contents)
    if reportMatch:
        report, = reportMatch.groups()
    else:
        errors.append("Contents could not be found")
    if errors:
        return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.

    people = []
    for personMatch in personRegex.findall(contents):
        # Search inside this person's own div, not the whole page; otherwise
        # every iteration would find the first person again.
        nameAuthorMatch = nameAuthorRegex.search(personMatch)
        if nameAuthorMatch:
            author, name = nameAuthorMatch.groups()
            if name.lower() in personExpeditionNameLookup:
                personExpo = personExpeditionNameLookup[name.lower()]
            else:
                errors.append("Person could not be found in database")
            author = bool(author)
        else:
            errors.append("Persons name could not be found")

        TUMatch = TURegex.search(personMatch)
        if TUMatch:
            TU, = TUMatch.groups()
        else:
            errors.append("TU could not be found")
        if not errors:
            # keep each person's own record so trips are not all attached
            # to whoever was parsed last
            people.append((personExpo, author, TU))
    if errors:
        return errors # Bail out before committing to the database

    logbookEntry = models.LogbookEntry(date = date,
                                       expedition = expedition,
                                       title = title, cave = cave, place = location,
                                       text = report, slug = slugify(title)[:50],
                                       filename = filename)
    logbookEntry.save()
    for personExpo, author, TU in people:
        models.PersonTrip(personexpedition = personExpo,
                          time_underground = TU,
                          logbook_entry = logbookEntry,
                          is_logbook_entry_author = author).save()
    print(logbookEntry)
+#.-*- coding: utf-8 -*- + +from django.conf import settings +import core.models as models + +from parsers.people import GetPersonExpeditionNameLookup +from parsers.cavetab import GetCaveLookup + +from django.template.defaultfilters import slugify + +import csv +import re +import datetime +import os + +from utils import save_carefully + +# +# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and +# it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc) +# + +# +# the logbook loading section +# +def GetTripPersons(trippeople, expedition, logtime_underground): + res = [ ] + author = None + for tripperson in re.split(",|\+|&|&(?!\w+;)| and ", trippeople): + tripperson = tripperson.strip() + mul = re.match("<u>(.*?)</u>$(?i)", tripperson) + if mul: + tripperson = mul.group(1).strip() + if tripperson and tripperson[0] != '*': + #assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap) + personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower()) + if not personyear: + print "NoMatchFor: '%s'" % tripperson + res.append((personyear, logtime_underground)) + if mul: + author = personyear + if not author: + if not res: + return None, None + author = res[-1][0] + return res, author + +def GetTripCave(place): #need to be fuzzier about matching here. Already a very slow function... 
+# print "Getting cave for " , place + try: + katastNumRes=[] + katastNumRes=list(models.Cave.objects.filter(kataster_number=int(place))) + except ValueError: + pass + officialNameRes=list(models.Cave.objects.filter(official_name=place)) + tripCaveRes=officialNameRes+katastNumRes + + if len(tripCaveRes)==1: +# print "Place " , place , "entered as" , tripCaveRes[0] + return tripCaveRes[0] + + elif models.OtherCaveName.objects.filter(name=place): + tripCaveRes=models.OtherCaveName.objects.filter(name__icontains=place)[0].cave +# print "Place " , place , "entered as" , tripCaveRes + return tripCaveRes + + elif len(tripCaveRes)>1: + print "Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes) + correctIndex=input("type list index of correct cave") + return tripCaveRes[correctIndex] + else: + print "No cave found for place " , place + return + + +noncaveplaces = [ "Journey", "Loser Plateau" ] +def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground): + """ saves a logbook entry and related persontrips """ + trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground) + if not author: + print "skipping logentry", title + return + +# tripCave = GetTripCave(place) + # + lplace = place.lower() + if lplace not in noncaveplaces: + cave=GetCaveLookup().get(lplace) + + #Check for an existing copy of the current entry, and save + expeditionday = expedition.get_expedition_day(date) + lookupAttribs={'date':date, 'title':title} + nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50]} + lbo, created=save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs) + + for tripperson, time_underground in trippersons: + lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo} + nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)} + #print nonLookupAttribs + 
save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs) + + +def ParseDate(tripdate, year): + """ Interprets dates in the expo logbooks and returns a correct datetime.date object """ + mdatestandard = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate) + mdategoof = re.match("(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate) + if mdatestandard: + assert mdatestandard.group(1) == year, (tripdate, year) + year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3)) + elif mdategoof: + assert not mdategoof.group(3) or mdategoof.group(3) == year[:2], mdategoof.groups() + yadd = int(year[:2]) * 100 + day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd + else: + assert False, tripdate + return datetime.date(year, month, day) + +# 2007, 2008, 2006 +def Parselogwikitxt(year, expedition, txt): + trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt) + for triphead, triptext in trippara: + tripheadp = triphead.split("|") + #print "ttt", tripheadp + assert len(tripheadp) == 3, (tripheadp, triptext) + tripdate, tripplace, trippeople = tripheadp + tripsplace = tripplace.split(" - ") + tripcave = tripsplace[0].strip() + + tul = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext) + if tul: + #assert len(tul) <= 1, (triphead, triptext) + #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext) + tu = tul[0][0] + else: + tu = "" + #assert tripcave == "Journey", (triphead, triptext) + + #print tripdate + ldate = ParseDate(tripdate.strip(), year) + #print "\n", tripcave, "--- ppp", trippeople, len(triptext) + EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) + +# 2002, 2004, 2005 +def Parseloghtmltxt(year, expedition, txt): + tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt) + for trippara in tripparas: + + s = 
re.match('''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date + \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)? + \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)? + \s*<div\s+class="trippeople">\s*(.*?)</div> + \s*<div\s+class="triptitle">\s*(.*?)</div> + ([\s\S]*?) + \s*(?:<div\s+class="timeug">\s*(.*?)</div>)? + \s*$ + ''', trippara) + if not s: + if not re.search("Rigging Guide", trippara): + print "can't parse: ", trippara # this is 2007 which needs editing + #assert s, trippara + continue + + tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups() + ldate = ParseDate(tripdate.strip(), year) + #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate) + trippeople = re.sub("Ol(?!l)", "Olly", trippeople) + trippeople = re.sub("Wook(?!e)", "Wookey", trippeople) + triptitles = triptitle.split(" - ") + if len(triptitles) >= 2: + tripcave = triptitles[0] + else: + tripcave = "UNKNOWN" + #print "\n", tripcave, "--- ppp", trippeople, len(triptext) + ltriptext = re.sub("</p>", "", triptext) + ltriptext = re.sub("\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub("<p>", "\n\n", ltriptext).strip() + EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) + + +# main parser for pre-2001. 
simpler because the data has been hacked so much to fit it +def Parseloghtml01(year, expedition, txt): + tripparas = re.findall("<hr[\s/]*>([\s\S]*?)(?=<hr)", txt) + for trippara in tripparas: + s = re.match(u"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara) + assert s, trippara[:300] + tripheader, triptext = s.group(1), s.group(2) + mtripid = re.search('<a id="(.*?)"', tripheader) + tripid = mtripid and mtripid.group(1) or "" + tripheader = re.sub("</?(?:[ab]|span)[^>]*>", "", tripheader) + + #print " ", [tripheader] + #continue + + tripdate, triptitle, trippeople = tripheader.split("|") + ldate = ParseDate(tripdate.strip(), year) + + mtu = re.search('<p[^>]*>(T/?U.*)', triptext) + if mtu: + tu = mtu.group(1) + triptext = triptext[:mtu.start(0)] + triptext[mtu.end():] + else: + tu = "" + + triptitles = triptitle.split(" - ") + tripcave = triptitles[0].strip() + + ltriptext = triptext + + mtail = re.search('(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext) + if mtail: + #print mtail.group(0) + ltriptext = ltriptext[:mtail.start(0)] + ltriptext = re.sub("</p>", "", ltriptext) + ltriptext = re.sub("\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub("<p>|<br>", "\n\n", ltriptext).strip() + #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext) + ltriptext = re.sub("</?u>", "_", ltriptext) + ltriptext = re.sub("</?i>", "''", ltriptext) + ltriptext = re.sub("</?b>", "'''", ltriptext) + + + #print ldate, trippeople.strip() + # could includ the tripid (url link for cross referencing) + EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) + + +def Parseloghtml03(year, expedition, txt): + tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt) + for trippara in tripparas: + s = re.match(u"(?s)\s*<p>(.*?)</p>(.*)$", trippara) + assert s, trippara + tripheader, triptext = s.group(1), s.group(2) + tripheader = re.sub(" ", " ", tripheader) + 
tripheader = re.sub("\s+", " ", tripheader).strip() + sheader = tripheader.split(" -- ") + tu = "" + if re.match("T/U|Time underwater", sheader[-1]): + tu = sheader.pop() + if len(sheader) != 3: + print "header not three pieces", sheader + tripdate, triptitle, trippeople = sheader + ldate = ParseDate(tripdate.strip(), year) + triptitles = triptitle.split(" , ") + if len(triptitles) >= 2: + tripcave = triptitles[0] + else: + tripcave = "UNKNOWN" + #print tripcave, "--- ppp", triptitle, trippeople, len(triptext) + ltriptext = re.sub("</p>", "", triptext) + ltriptext = re.sub("\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub("<p>", "\n\n", ltriptext).strip() + ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext) + EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) + +yearlinks = [ + ("2009", "2009/2009logbook.txt", Parselogwikitxt), + ("2008", "2008/2008logbook.txt", Parselogwikitxt), + ("2007", "2007/logbook.html", Parseloghtmltxt), + ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt), + ("2005", "2005/logbook.html", Parseloghtmltxt), + ("2004", "2004/logbook.html", Parseloghtmltxt), + ("2003", "2003/logbook.html", Parseloghtml03), + ("2002", "2002/logbook.html", Parseloghtmltxt), + ("2001", "2001/log.htm", Parseloghtml01), + ("2000", "2000/log.htm", Parseloghtml01), + ("1999", "1999/log.htm", Parseloghtml01), + ("1998", "1998/log.htm", Parseloghtml01), + ("1997", "1997/log.htm", Parseloghtml01), + ("1996", "1996/log.htm", Parseloghtml01), + ("1995", "1995/log.htm", Parseloghtml01), + ("1994", "1994/log.htm", Parseloghtml01), + ("1993", "1993/log.htm", Parseloghtml01), + ("1992", "1992/log.htm", Parseloghtml01), + ("1991", "1991/log.htm", Parseloghtml01), + ] + +def SetDatesFromLogbookEntries(expedition): + """ + Sets the date_from and date_to field for an expedition based on persontrips. 
+ Then sets the expedition date_from and date_to based on the personexpeditions. + """ + for personexpedition in expedition.personexpedition_set.all(): + persontrips = personexpedition.persontrip_set.order_by('logbook_entry__date') + # sequencing is difficult to do + lprevpersontrip = None + for persontrip in persontrips: + persontrip.persontrip_prev = lprevpersontrip + if lprevpersontrip: + lprevpersontrip.persontrip_next = persontrip + lprevpersontrip.save() + persontrip.persontrip_next = None + lprevpersontrip = persontrip + persontrip.save() + + + +def LoadLogbookForExpedition(expedition): + """ Parses all logbook entries for one expedition """ + + expowebbase = os.path.join(settings.EXPOWEB, "years") + year = str(expedition.year) + for lyear, lloc, parsefunc in yearlinks: + if lyear == year: + break + fin = open(os.path.join(expowebbase, lloc)) + print "opennning", lloc + txt = fin.read().decode("latin1") + fin.close() + parsefunc(year, expedition, txt) + SetDatesFromLogbookEntries(expedition) + return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count()) + + +def LoadLogbooks(): + """ This is the master function for parsing all logbooks into the Troggle database. Requires yearlinks, which is a list of tuples for each expedition with expedition year, logbook path, and parsing function. 
""" + + #Deletion has been moved to a seperate function to enable the non-destructive importing + #models.LogbookEntry.objects.all().delete() + expowebbase = os.path.join(settings.EXPOWEB, "years") + #yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite + #yearlinks = [ ("1996", "1996/log.htm", Parseloghtml01),] # overwrite + + for year, lloc, parsefunc in yearlinks: + expedition = models.Expedition.objects.filter(year = year)[0] + fin = open(os.path.join(expowebbase, lloc)) + txt = fin.read().decode("latin1") + fin.close() + parsefunc(year, expedition, txt) + SetDatesFromLogbookEntries(expedition) + +dateRegex = re.compile('<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S) +expeditionYearRegex = re.compile('<span\s+class="expeditionyear">(.*?)</span>', re.S) +titleRegex = re.compile('<H1>(.*?)</H1>', re.S) +reportRegex = re.compile('<div\s+class="report">(.*)</div>\s*</body>', re.S) +personRegex = re.compile('<div\s+class="person">(.*?)</div>', re.S) +nameAuthorRegex = re.compile('<span\s+class="name(,author|)">(.*?)</span>', re.S) +TURegex = re.compile('<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S) +locationRegex = re.compile('<span\s+class="location">(.*?)</span>', re.S) +caveRegex = re.compile('<span\s+class="cave">(.*?)</span>', re.S) + +def parseAutoLogBookEntry(filename): + errors = [] + f = open(filename, "r") + contents = f.read() + f.close() + + dateMatch = dateRegex.search(contents) + if dateMatch: + year, month, day = [int(x) for x in dateMatch.groups()] + date = datetime.date(year, month, day) + else: + errors.append("Date could not be found") + + expeditionYearMatch = expeditionYearRegex.search(contents) + if expeditionYearMatch: + try: + expedition = models.Expedition.objects.get(year = expeditionYearMatch.groups()[0]) + personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition) + except models.Expedition.DoesNotExist: + errors.append("Expedition not in database") + else: + errors.append("Expediton Year 
could not be parsed") + + titleMatch = titleRegex.search(contents) + if titleMatch: + title, = titleMatch.groups() + if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH: + errors.append("Title too long") + else: + errors.append("Title could not be found") + + caveMatch = caveRegex.search(contents) + if caveMatch: + caveRef, = caveMatch.groups() + try: + cave = models.getCaveByReference(caveRef) + except AssertionError: + cave = None + errors.append("Cave not found in database") + else: + cave = None + + locationMatch = locationRegex.search(contents) + if locationMatch: + location, = locationMatch.groups() + else: + location = None + + if cave is None and location is None: + errors.append("Location nor cave could not be found") + + reportMatch = reportRegex.search(contents) + if reportMatch: + report, = reportMatch.groups() + else: + errors.append("Contents could not be found") + if errors: + return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from. 
+ people = [] + for personMatch in personRegex.findall(contents): + nameAuthorMatch = nameAuthorRegex.search(contents) + if nameAuthorMatch: + author, name = nameAuthorMatch.groups() + if name.lower() in personExpeditionNameLookup: + personExpo = personExpeditionNameLookup[name.lower()] + else: + errors.append("Person could not be found in database") + author = bool(author) + else: + errors.append("Persons name could not be found") + + TUMatch = TURegex.search(contents) + if TUMatch: + TU, = TUMatch.groups() + else: + errors.append("TU could not be found") + if not errors: + people.append((name, author, TU)) + if errors: + return errors # Bail out before commiting to the database + logbookEntry = models.LogbookEntry(date = date, + expedition = expedition, + title = title, cave = cave, place = location, + text = report, slug = slugify(title)[:50], + filename = filename) + logbookEntry.save() + for name, author, TU in people: + models.PersonTrip(personexpedition = personExpo, + time_underground = TU, + logbook_entry = logbookEntry, + is_logbook_entry_author = author).save() + print logbookEntry |