From 78cedb2070c7e9d4e4eb418ecde7b60ba428273f Mon Sep 17 00:00:00 2001
From: Wookey
Date: Mon, 11 Jul 2011 02:10:22 +0100
Subject: remove all the DOS linefeeds

---
 parsers/logbooks.py | 864 ++++++++++++++++++++++++++--------------------------
 1 file changed, 432 insertions(+), 432 deletions(-)

(limited to 'parsers/logbooks.py')

diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index af01f46..c794f9f 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,432 +1,432 @@

#.-*- coding: utf-8 -*-

from django.conf import settings
import core.models as models

from parsers.people import GetPersonExpeditionNameLookup
from parsers.cavetab import GetCaveLookup

from django.template.defaultfilters import slugify

import csv
import re
import datetime
import os

from utils import save_carefully

#
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc)
#

#
# the logbook loading section
#
def GetTripPersons(trippeople, expedition, logtime_underground):
    res = [ ]
    author = None
    for tripperson in re.split(",|\+|&amp;|&(?!\w+;)| and ", trippeople):
        tripperson = tripperson.strip()
        mul = re.match("<u>(.*?)</u>$(?i)", tripperson)
        if mul:
            tripperson = mul.group(1).strip()
        if tripperson and tripperson[0] != '*':
            #assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
            personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
            if not personyear:
                print "NoMatchFor: '%s'" % tripperson
            res.append((personyear, logtime_underground))
            if mul:
                author = personyear
    if not author:
        if not res:
            return None, None
        author = res[-1][0]
    return res, author

def GetTripCave(place):   # need to be fuzzier about matching here. Already a very slow function...
#   print "Getting cave for " , place
    try:
        katastNumRes=[]
        katastNumRes=list(models.Cave.objects.filter(kataster_number=int(place)))
    except ValueError:
        pass
    officialNameRes=list(models.Cave.objects.filter(official_name=place))
    tripCaveRes=officialNameRes+katastNumRes

    if len(tripCaveRes)==1:
#       print "Place " , place , "entered as" , tripCaveRes[0]
        return tripCaveRes[0]

    elif models.OtherCaveName.objects.filter(name=place):
        tripCaveRes=models.OtherCaveName.objects.filter(name__icontains=place)[0].cave
#       print "Place " , place , "entered as" , tripCaveRes
        return tripCaveRes

    elif len(tripCaveRes)>1:
        print "Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes)
        correctIndex=input("type list index of correct cave")
        return tripCaveRes[correctIndex]
    else:
        print "No cave found for place " , place
        return

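# Illustrative sketch (not part of the original logbooks.py): how GetTripPersons is
# intended to be used.  The names and the expedition object are hypothetical, and a
# real call needs a populated troggle database so that GetPersonExpeditionNameLookup
# can resolve each nickname to a PersonExpedition.
def _example_gettrippersons(expedition):
    # "Becka, <u>Wookey</u> + Olly" splits on ",", "+", "&amp;", a bare "&" and " and ";
    # the <u>...</u> markup is what flags the author of the entry.
    trippersons, author = GetTripPersons("Becka, <u>Wookey</u> + Olly", expedition, 5)
    # trippersons is a list of (PersonExpedition or None, time_underground) pairs;
    # author is the underlined person, falling back to the last person listed.
    return trippersons, author
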
noncaveplaces = [ "Journey", "Loser Plateau" ]
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground):
    """ saves a logbook entry and related persontrips """
    trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
    if not author:
        print "skipping logentry", title
        return

#   tripCave = GetTripCave(place)
    #
    lplace = place.lower()
    if lplace not in noncaveplaces:
        cave=GetCaveLookup().get(lplace)

    #Check for an existing copy of the current entry, and save
    expeditionday = expedition.get_expedition_day(date)
    lookupAttribs={'date':date, 'title':title}
    nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50]}
    lbo, created=save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)

    for tripperson, time_underground in trippersons:
        lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
        nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
        #print nonLookupAttribs
        save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)

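# Illustrative sketch (not part of the original file): the lookupAttribs /
# nonLookupAttribs split used by EnterLogIntoDbase above.  save_carefully (from
# utils) is assumed to behave like Django's get_or_create: the lookup attributes
# identify an existing row, the rest are (re)applied to it, so re-running the import
# does not duplicate entries.  The title and expedition here are hypothetical.
def _example_save_carefully(expedition, date):
    lookupAttribs = {'date': date, 'title': "161 - Rigging the entrance"}
    nonLookupAttribs = {'place': "161", 'text': "Rigged the entrance series.",
                        'expedition': expedition, 'cave': None,
                        'slug': slugify("161 - Rigging the entrance")[:50]}
    lbo, created = save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)
    return lbo, created
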

def ParseDate(tripdate, year):
    """ Interprets dates in the expo logbooks and returns a correct datetime.date object """
    mdatestandard = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
    mdategoof = re.match("(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
    if mdatestandard:
        assert mdatestandard.group(1) == year, (tripdate, year)
        year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
    elif mdategoof:
        assert not mdategoof.group(3) or mdategoof.group(3) == year[:2], mdategoof.groups()
        yadd = int(year[:2]) * 100
        day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
    else:
        assert False, tripdate
    return datetime.date(year, month, day)

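# Illustrative sketch (not part of the original file): the two date spellings
# ParseDate accepts.  "year" is always passed in as a string such as "2007".
def _example_parsedate():
    assert ParseDate("2007-07-25", "2007") == datetime.date(2007, 7, 25)   # ISO style
    assert ParseDate("25/7/07", "2007") == datetime.date(2007, 7, 25)      # d/m/yy "goof" style
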

# 2007, 2008, 2006
def Parselogwikitxt(year, expedition, txt):
    trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt)
    for triphead, triptext in trippara:
        tripheadp = triphead.split("|")
        #print "ttt", tripheadp
        assert len(tripheadp) == 3, (tripheadp, triptext)
        tripdate, tripplace, trippeople = tripheadp
        tripsplace = tripplace.split(" - ")
        tripcave = tripsplace[0].strip()

        tul = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
        if tul:
            #assert len(tul) <= 1, (triphead, triptext)
            #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
            tu = tul[0][0]
        else:
            tu = ""
        #assert tripcave == "Journey", (triphead, triptext)

        #print tripdate
        ldate = ParseDate(tripdate.strip(), year)
        #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
        EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)

# 2002, 2004, 2005
def Parseloghtmltxt(year, expedition, txt):
    tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        s = re.match('''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
                        \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
                        \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
                        \s*<div\s+class="trippeople">\s*(.*?)</div>
                        \s*<div\s+class="triptitle">\s*(.*?)</div>
                    ([\s\S]*?)
                        \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
                        \s*$
                     ''', trippara)
        if not s:
            if not re.search("Rigging Guide", trippara):
                print "can't parse: ", trippara  # this is 2007 which needs editing
            #assert s, trippara
            continue

        tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
        ldate = ParseDate(tripdate.strip(), year)
        #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
        trippeople = re.sub("Ol(?!l)", "Olly", trippeople)
        trippeople = re.sub("Wook(?!e)", "Wookey", trippeople)
        triptitles = triptitle.split(" - ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"
        #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
        ltriptext = re.sub("</p>", "", triptext)
        ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
        EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)


# main parser for pre-2001.  simpler because the data has been hacked so much to fit it
def Parseloghtml01(year, expedition, txt):
    tripparas = re.findall("<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        s = re.match(u"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
        assert s, trippara[:300]
        tripheader, triptext = s.group(1), s.group(2)
        mtripid = re.search('<a id="(.*?)"', tripheader)
        tripid = mtripid and mtripid.group(1) or ""
        tripheader = re.sub("</?(?:[ab]|span)[^>]*>", "", tripheader)

        #print "   ", [tripheader]
        #continue

        tripdate, triptitle, trippeople = tripheader.split("|")
        ldate = ParseDate(tripdate.strip(), year)

        mtu = re.search('<p[^>]*>(T/?U.*)', triptext)
        if mtu:
            tu = mtu.group(1)
            triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
        else:
            tu = ""

        triptitles = triptitle.split(" - ")
        tripcave = triptitles[0].strip()

        ltriptext = triptext

        mtail = re.search('(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
        if mtail:
            #print mtail.group(0)
            ltriptext = ltriptext[:mtail.start(0)]
        ltriptext = re.sub("</p>", "", ltriptext)
        ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub("<p>|<br>", "\n\n", ltriptext).strip()
        #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
        ltriptext = re.sub("</?u>", "_", ltriptext)
        ltriptext = re.sub("</?i>", "''", ltriptext)
        ltriptext = re.sub("</?b>", "'''", ltriptext)


        #print ldate, trippeople.strip()
        # could include the tripid (url link for cross referencing)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)

def Parseloghtml03(year, expedition, txt):
    tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        s = re.match(u"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
        assert s, trippara
        tripheader, triptext = s.group(1), s.group(2)
        tripheader = re.sub("&nbsp;", " ", tripheader)
        tripheader = re.sub("\s+", " ", tripheader).strip()
        sheader = tripheader.split(" -- ")
        tu = ""
        if re.match("T/U|Time underwater", sheader[-1]):
            tu = sheader.pop()
        if len(sheader) != 3:
            print "header not three pieces", sheader
        tripdate, triptitle, trippeople = sheader
        ldate = ParseDate(tripdate.strip(), year)
        triptitles = triptitle.split(" , ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"
        #print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
        ltriptext = re.sub("</p>", "", triptext)
        ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
        ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
        EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)

yearlinks = [
    ("2009", "2009/2009logbook.txt", Parselogwikitxt),
    ("2008", "2008/2008logbook.txt", Parselogwikitxt),
    ("2007", "2007/logbook.html", Parseloghtmltxt),
    ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt),
    ("2005", "2005/logbook.html", Parseloghtmltxt),
    ("2004", "2004/logbook.html", Parseloghtmltxt),
    ("2003", "2003/logbook.html", Parseloghtml03),
    ("2002", "2002/logbook.html", Parseloghtmltxt),
    ("2001", "2001/log.htm", Parseloghtml01),
    ("2000", "2000/log.htm", Parseloghtml01),
    ("1999", "1999/log.htm", Parseloghtml01),
    ("1998", "1998/log.htm", Parseloghtml01),
    ("1997", "1997/log.htm", Parseloghtml01),
    ("1996", "1996/log.htm", Parseloghtml01),
    ("1995", "1995/log.htm", Parseloghtml01),
    ("1994", "1994/log.htm", Parseloghtml01),
    ("1993", "1993/log.htm", Parseloghtml01),
    ("1992", "1992/log.htm", Parseloghtml01),
    ("1991", "1991/log.htm", Parseloghtml01),
    ]

def SetDatesFromLogbookEntries(expedition):
    """
    Sets the date_from and date_to field for an expedition based on persontrips.
    Then sets the expedition date_from and date_to based on the personexpeditions.
    """
    for personexpedition in expedition.personexpedition_set.all():
        persontrips = personexpedition.persontrip_set.order_by('logbook_entry__date')
        # sequencing is difficult to do
        lprevpersontrip = None
        for persontrip in persontrips:
            persontrip.persontrip_prev = lprevpersontrip
            if lprevpersontrip:
                lprevpersontrip.persontrip_next = persontrip
                lprevpersontrip.save()
            persontrip.persontrip_next = None
            lprevpersontrip = persontrip
            persontrip.save()



def LoadLogbookForExpedition(expedition):
    """ Parses all logbook entries for one expedition """

    expowebbase = os.path.join(settings.EXPOWEB, "years")
    year = str(expedition.year)
    for lyear, lloc, parsefunc in yearlinks:
        if lyear == year:
            break
    fin = open(os.path.join(expowebbase, lloc))
    print "opening", lloc
    txt = fin.read().decode("latin1")
    fin.close()
    parsefunc(year, expedition, txt)
    SetDatesFromLogbookEntries(expedition)
    return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())


def LoadLogbooks():
    """ This is the master function for parsing all logbooks into the Troggle database.  Requires yearlinks, which is a list of tuples for each expedition with expedition year, logbook path, and parsing function. """

    #Deletion has been moved to a separate function to enable the non-destructive importing
    #models.LogbookEntry.objects.all().delete()
    expowebbase = os.path.join(settings.EXPOWEB, "years")
    #yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite
    #yearlinks = [ ("1996", "1996/log.htm", Parseloghtml01),] # overwrite

    for year, lloc, parsefunc in yearlinks:
        expedition = models.Expedition.objects.filter(year = year)[0]
        fin = open(os.path.join(expowebbase, lloc))
        txt = fin.read().decode("latin1")
        fin.close()
        parsefunc(year, expedition, txt)
        SetDatesFromLogbookEntries(expedition)

dateRegex = re.compile('<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
expeditionYearRegex = re.compile('<span\s+class="expeditionyear">(.*?)</span>', re.S)
titleRegex = re.compile('<H1>(.*?)</H1>', re.S)
reportRegex = re.compile('<div\s+class="report">(.*)</div>\s*</body>', re.S)
personRegex = re.compile('<div\s+class="person">(.*?)</div>', re.S)
nameAuthorRegex = re.compile('<span\s+class="name(,author|)">(.*?)</span>', re.S)
TURegex = re.compile('<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
locationRegex = re.compile('<span\s+class="location">(.*?)</span>', re.S)
caveRegex = re.compile('<span\s+class="cave">(.*?)</span>', re.S)

def parseAutoLogBookEntry(filename):
    errors = []
    f = open(filename, "r")
    contents = f.read()
    f.close()

    dateMatch = dateRegex.search(contents)
    if dateMatch:
        year, month, day = [int(x) for x in dateMatch.groups()]
        date = datetime.date(year, month, day)
    else:
        errors.append("Date could not be found")

    expeditionYearMatch = expeditionYearRegex.search(contents)
    if expeditionYearMatch:
        try:
            expedition = models.Expedition.objects.get(year = expeditionYearMatch.groups()[0])
            personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
        except models.Expedition.DoesNotExist:
            errors.append("Expedition not in database")
    else:
        errors.append("Expedition Year could not be parsed")

    titleMatch = titleRegex.search(contents)
    if titleMatch:
        title, = titleMatch.groups()
        if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
            errors.append("Title too long")
    else:
        errors.append("Title could not be found")

    caveMatch = caveRegex.search(contents)
    if caveMatch:
        caveRef, = caveMatch.groups()
        try:
            cave = models.getCaveByReference(caveRef)
        except AssertionError:
            cave = None
            errors.append("Cave not found in database")
    else:
        cave = None

    locationMatch = locationRegex.search(contents)
    if locationMatch:
        location, = locationMatch.groups()
    else:
        location = None

    if cave is None and location is None:
        errors.append("Neither location nor cave could be found")

    reportMatch = reportRegex.search(contents)
    if reportMatch:
        report, = reportMatch.groups()
    else:
        errors.append("Contents could not be found")
    if errors:
        return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.
    people = []
    for personMatch in personRegex.findall(contents):
        nameAuthorMatch = nameAuthorRegex.search(contents)
        if nameAuthorMatch:
            author, name = nameAuthorMatch.groups()
            if name.lower() in personExpeditionNameLookup:
                personExpo = personExpeditionNameLookup[name.lower()]
            else:
                errors.append("Person could not be found in database")
            author = bool(author)
        else:
            errors.append("Person's name could not be found")

        TUMatch = TURegex.search(contents)
        if TUMatch:
            TU, = TUMatch.groups()
        else:
            errors.append("TU could not be found")
        if not errors:
            people.append((name, author, TU))
    if errors:
        return errors # Bail out before committing to the database
    logbookEntry = models.LogbookEntry(date = date,
                                       expedition = expedition,
                                       title = title, cave = cave, place = location,
                                       text = report, slug = slugify(title)[:50],
                                       filename = filename)
    logbookEntry.save()
    for name, author, TU in people:
        models.PersonTrip(personexpedition = personExpo,
                          time_underground = TU,
                          logbook_entry = logbookEntry,
                          is_logbook_entry_author = author).save()
    print logbookEntry

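# Illustrative sketch (not part of the original file): a minimal auto-generated
# logbook entry of the kind the regexes above are written for, plus a call.  All
# field values and the file path are invented; a real run needs the expedition,
# people and cave referenced by the file to exist in the database.  A return value
# of None means the entry was saved; otherwise a list of error strings comes back.
_EXAMPLE_AUTO_ENTRY = '''<html><body>
<span class="date">2009-07-21</span>
<span class="expeditionyear">2009</span>
<H1>Surveying in 204</H1>
<span class="cave">1623-204</span>
<span class="location">Steinbrueckenhoehle</span>
<div class="person"><span class="name,author">Wookey</span><span class="TU">5.5</span></div>
<div class="person"><span class="name">Becka</span><span class="TU">5.5</span></div>
<div class="report">Surveyed the new rift series.</div>
</body></html>
'''

def _example_parse_auto_entry(filename):
    errors = parseAutoLogBookEntry(filename)   # e.g. a file containing _EXAMPLE_AUTO_ENTRY
    return errors
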

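# Illustrative sketch (not part of the original file): the wiki-style logbook text
# that Parselogwikitxt (above) expects for 2006, 2008 and 2009.  The entries are
# invented.  Each trip starts with a ===date|place|people=== header; the lookahead in
# the regex means a trip's text runs up to the *next* === header.
_EXAMPLE_WIKI_LOGBOOK = """\
===2008-07-15|204 Steinbrueckenhoehle - Surveying|Becka, Wookey===
Surveyed the new passage.  T/U: 6 hrs
===2008-07-16|Journey|Olly===
Drove up to top camp.
"""
# A hypothetical call, needing a 2008 Expedition and these people in the database:
#   Parselogwikitxt("2008", expedition, _EXAMPLE_WIKI_LOGBOOK)
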
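# Illustrative sketch (not part of the original file): the kind of block, delimited
# by <hr /> tags, that Parseloghtmltxt (above) pulls apart for the 2002, 2004 and
# 2005 logbooks (2007 is also routed through it).  The div class names mirror the
# groups captured by its regex; the entry itself is invented.
_EXAMPLE_HTMLTXT_TRIP = '''
<hr />
<a id="t2004-07-20a" /></a>
<div class="tripdate" id="t2004-07-20a">2004-07-20</div>
<div class="trippeople"><u>Wookey</u>, Becka</div>
<div class="triptitle">161 Kaninchenhoehle - Rigging</div>
<p>Rigged the entrance series ready for pushing trips.</p>
<div class="timeug">T/U 4 hrs</div>
<hr />
'''
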
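# Illustrative sketch (not part of the original file): the hand-massaged pre-2001
# format Parseloghtml01 (above) expects -- a first paragraph of "date|title|people",
# then the trip text, with an optional T/U paragraph.  The entry is invented.
_EXAMPLE_HTML01_TRIP = u'''
<hr>
<p><a id="t1999-07-12b"></a>1999-07-12|161 Kaninchenhoehle - Far End|Wookey, Becka</p>
<p>Pushed the far end and surveyed about 100m.</p>
<p>T/U 8 hrs</p>
<hr>
'''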

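# Illustrative sketch (not part of the original file): the 2003 block format
# Parseloghtml03 (above) expects -- a <p> header of "date -- title -- people" with an
# optional trailing " -- T/U ..." piece, then the trip text.  The entry is invented.
_EXAMPLE_HTML03_TRIP = u'''
<hr />
<p>2003-08-02 -- 204 Steinbrueckenhoehle , Rhino Rift -- Becka, Wookey -- T/U 5 hrs</p>
<p>Dropped Rhino Rift and surveyed on the way out.</p>
<hr />
'''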

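# Illustrative sketch (not part of the original file): how the yearlinks table and
# the loader functions above are used for a re-import, e.g. from a Django shell with
# troggle's settings loaded.  The year is hypothetical; it must appear both in
# yearlinks and as an Expedition row.
def _example_reload_logbooks():
    # Reload a single year...
    expedition = models.Expedition.objects.filter(year="2008")[0]
    LoadLogbookForExpedition(expedition)
    # ...or re-parse every year listed in yearlinks:
    LoadLogbooks()
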

--
cgit v1.2.3