diff options
Diffstat (limited to 'parsers')
-rw-r--r-- | parsers/logbooks.py | 211 | ||||
-rw-r--r-- | parsers/people.py | 6 |
2 files changed, 120 insertions, 97 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py index cb40f58..4554b08 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -26,16 +26,16 @@ from utils import save_carefully def GetTripPersons(trippeople, expedition, logtime_underground): res = [ ] author = None - for tripperson in re.split(",|\+|&|&(?!\w+;)| and ", trippeople): + for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople): tripperson = tripperson.strip() - mul = re.match("<u>(.*?)</u>$(?i)", tripperson) + mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson) if mul: tripperson = mul.group(1).strip() if tripperson and tripperson[0] != '*': #assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap) personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower()) if not personyear: - print "NoMatchFor: '%s'" % tripperson + print(" - No name match for: '%s'" % tripperson) res.append((personyear, logtime_underground)) if mul: author = personyear @@ -65,11 +65,11 @@ def GetTripCave(place): #need to be fuzzier about matching h return tripCaveRes elif len(tripCaveRes)>1: - print "Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes) + print("Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes)) correctIndex=input("type list index of correct cave") return tripCaveRes[correctIndex] else: - print "No cave found for place " , place + print("No cave found for place " , place) return @@ -78,7 +78,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_ """ saves a logbook entry and related persontrips """ trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground) if not author: - print "skipping logentry", title + print(" - skipping logentry" + title + " no author for entry") return # tripCave = GetTripCave(place) @@ -102,8 +102,8 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_ def ParseDate(tripdate, year): """ Interprets dates in the expo logbooks and returns a correct datetime.date object """ - mdatestandard = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate) - mdategoof = re.match("(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate) + mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate) + mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate) if mdatestandard: assert mdatestandard.group(1) == year, (tripdate, year) year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3)) @@ -117,7 +117,7 @@ def ParseDate(tripdate, year): # 2007, 2008, 2006 def Parselogwikitxt(year, expedition, txt): - trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt) + trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt) for triphead, triptext in trippara: tripheadp = triphead.split("|") #print "ttt", tripheadp @@ -126,7 +126,7 @@ def Parselogwikitxt(year, expedition, txt): tripsplace = tripplace.split(" - ") tripcave = tripsplace[0].strip() - tul = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext) + tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext) if tul: #assert len(tul) <= 1, (triphead, triptext) #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext) @@ -142,10 +142,14 @@ def Parselogwikitxt(year, expedition, txt): # 2002, 2004, 2005 def Parseloghtmltxt(year, expedition, txt): - tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt) + print(" - Using log html parser") + tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt) + logbook_entry_count = 0 for trippara in tripparas: + #print(" - HR detected - maybe a trip?") + logbook_entry_count += 1 - s = re.match('''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date + s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)? \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)? \s*<div\s+class="trippeople">\s*(.*?)</div> @@ -155,38 +159,40 @@ def Parseloghtmltxt(year, expedition, txt): \s*$ ''', trippara) if not s: - if not re.search("Rigging Guide", trippara): - print "can't parse: ", trippara # this is 2007 which needs editing + if not re.search(r"Rigging Guide", trippara): + print("can't parse: ", trippara) # this is 2007 which needs editing #assert s, trippara continue tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups() ldate = ParseDate(tripdate.strip(), year) #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate) - trippeople = re.sub("Ol(?!l)", "Olly", trippeople) - trippeople = re.sub("Wook(?!e)", "Wookey", trippeople) + trippeople = re.sub(r"Ol(?!l)", "Olly", trippeople) + trippeople = re.sub(r"Wook(?!e)", "Wookey", trippeople) triptitles = triptitle.split(" - ") if len(triptitles) >= 2: tripcave = triptitles[0] else: tripcave = "UNKNOWN" #print "\n", tripcave, "--- ppp", trippeople, len(triptext) - ltriptext = re.sub("</p>", "", triptext) - ltriptext = re.sub("\s*?\n\s*", " ", ltriptext) - ltriptext = re.sub("<p>", "\n\n", ltriptext).strip() + ltriptext = re.sub(r"</p>", "", triptext) + ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip() EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) + if logbook_entry_count == 0: + print(" - No trip entrys found in logbook, check the syntax matches htmltxt format") # main parser for pre-2001. simpler because the data has been hacked so much to fit it def Parseloghtml01(year, expedition, txt): - tripparas = re.findall("<hr[\s/]*>([\s\S]*?)(?=<hr)", txt) + tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt) for trippara in tripparas: s = re.match(u"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara) assert s, trippara[:300] tripheader, triptext = s.group(1), s.group(2) - mtripid = re.search('<a id="(.*?)"', tripheader) + mtripid = re.search(r'<a id="(.*?)"', tripheader) tripid = mtripid and mtripid.group(1) or "" - tripheader = re.sub("</?(?:[ab]|span)[^>]*>", "", tripheader) + tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader) #print " ", [tripheader] #continue @@ -194,7 +200,7 @@ def Parseloghtml01(year, expedition, txt): tripdate, triptitle, trippeople = tripheader.split("|") ldate = ParseDate(tripdate.strip(), year) - mtu = re.search('<p[^>]*>(T/?U.*)', triptext) + mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext) if mtu: tu = mtu.group(1) triptext = triptext[:mtu.start(0)] + triptext[mtu.end():] @@ -206,17 +212,17 @@ def Parseloghtml01(year, expedition, txt): ltriptext = triptext - mtail = re.search('(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext) + mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext) if mtail: #print mtail.group(0) ltriptext = ltriptext[:mtail.start(0)] - ltriptext = re.sub("</p>", "", ltriptext) - ltriptext = re.sub("\s*?\n\s*", " ", ltriptext) - ltriptext = re.sub("<p>|<br>", "\n\n", ltriptext).strip() + ltriptext = re.sub(r"</p>", "", ltriptext) + ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip() #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext) - ltriptext = re.sub("</?u>", "_", ltriptext) - ltriptext = re.sub("</?i>", "''", ltriptext) - ltriptext = re.sub("</?b>", "'''", ltriptext) + ltriptext = re.sub(r"</?u>", "_", ltriptext) + ltriptext = re.sub(r"</?i>", "''", ltriptext) + ltriptext = re.sub(r"</?b>", "'''", ltriptext) #print ldate, trippeople.strip() @@ -225,19 +231,19 @@ def Parseloghtml01(year, expedition, txt): def Parseloghtml03(year, expedition, txt): - tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt) + tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt) for trippara in tripparas: s = re.match(u"(?s)\s*<p>(.*?)</p>(.*)$", trippara) assert s, trippara tripheader, triptext = s.group(1), s.group(2) - tripheader = re.sub(" ", " ", tripheader) - tripheader = re.sub("\s+", " ", tripheader).strip() + tripheader = re.sub(r" ", " ", tripheader) + tripheader = re.sub(r"\s+", " ", tripheader).strip() sheader = tripheader.split(" -- ") tu = "" if re.match("T/U|Time underwater", sheader[-1]): tu = sheader.pop() if len(sheader) != 3: - print "header not three pieces", sheader + print("header not three pieces", sheader) tripdate, triptitle, trippeople = sheader ldate = ParseDate(tripdate.strip(), year) triptitles = triptitle.split(" , ") @@ -246,37 +252,12 @@ def Parseloghtml03(year, expedition, txt): else: tripcave = "UNKNOWN" #print tripcave, "--- ppp", triptitle, trippeople, len(triptext) - ltriptext = re.sub("</p>", "", triptext) - ltriptext = re.sub("\s*?\n\s*", " ", ltriptext) - ltriptext = re.sub("<p>", "\n\n", ltriptext).strip() - ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext) + ltriptext = re.sub(r"</p>", "", triptext) + ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip() + ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext) EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) -yearlinks = [ -# ("2013", "2013/logbook.html", Parseloghtmltxt), - ("2012", "2012/logbook.html", Parseloghtmltxt), - ("2011", "2011/logbook.html", Parseloghtmltxt), - ("2010", "2010/logbook.html", Parselogwikitxt), - ("2009", "2009/2009logbook.txt", Parselogwikitxt), - ("2008", "2008/2008logbook.txt", Parselogwikitxt), - ("2007", "2007/logbook.html", Parseloghtmltxt), - ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt), - ("2005", "2005/logbook.html", Parseloghtmltxt), - ("2004", "2004/logbook.html", Parseloghtmltxt), - ("2003", "2003/logbook.html", Parseloghtml03), - ("2002", "2002/logbook.html", Parseloghtmltxt), - ("2001", "2001/log.htm", Parseloghtml01), - ("2000", "2000/log.htm", Parseloghtml01), - ("1999", "1999/log.htm", Parseloghtml01), - ("1998", "1998/log.htm", Parseloghtml01), - ("1997", "1997/log.htm", Parseloghtml01), - ("1996", "1996/log.htm", Parseloghtml01), - ("1995", "1995/log.htm", Parseloghtml01), - ("1994", "1994/log.htm", Parseloghtml01), - ("1993", "1993/log.htm", Parseloghtml01), - ("1992", "1992/log.htm", Parseloghtml01), - ("1991", "1991/log.htm", Parseloghtml01), - ] def SetDatesFromLogbookEntries(expedition): """ @@ -295,23 +276,41 @@ def SetDatesFromLogbookEntries(expedition): persontrip.persontrip_next = None lprevpersontrip = persontrip persontrip.save() - - - + + def LoadLogbookForExpedition(expedition): """ Parses all logbook entries for one expedition """ expowebbase = os.path.join(settings.EXPOWEB, "years") year = str(expedition.year) - for lyear, lloc, parsefunc in yearlinks: - if lyear == year: - break - fin = open(os.path.join(expowebbase, lloc)) - print "opennning", lloc - txt = fin.read().decode("latin1") - fin.close() - parsefunc(year, expedition, txt) - SetDatesFromLogbookEntries(expedition) + yearlinks = settings.LOGBOOK_PARSER_SETTINGS + + logbook_parseable = False + + if expedition.year in yearlinks: + year_settings = yearlinks[expedition.year] + file_in = open(os.path.join(expowebbase, year_settings[0])) + txt = file_in.read().decode("latin1") + file_in.close() + parsefunc = year_settings[1] + logbook_parseable = True + else: + try: + file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE)) + txt = file_in.read().decode("latin1") + file_in.close() + logbook_parseable = True + print("No set parser found using default") + parsefunc = settings.DEFAULT_LOGBOOK_PARSER + except (IOError): + logbook_parseable = False + print("Couldn't open default logbook file and nothing set for expo " + expo.year) + + if logbook_parseable: + parser = globals()[parsefunc] + parser(expedition.year, expedition, txt) + SetDatesFromLogbookEntries(expedition) + return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count()) @@ -324,25 +323,49 @@ def LoadLogbooks(): #yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite #yearlinks = [ ("1996", "1996/log.htm", Parseloghtml01),] # overwrite - for year, lloc, parsefunc in yearlinks: - # This will not work until the corresponding year exists in the database. - # In 2012 this needed noscript/folk.csv to be updated first. - expedition = models.Expedition.objects.filter(year = year)[0] - fin = open(os.path.join(expowebbase, lloc)) - txt = fin.read().decode("latin1") - fin.close() - parsefunc(year, expedition, txt) - SetDatesFromLogbookEntries(expedition) + yearlinks = settings.LOGBOOK_PARSER_SETTINGS -dateRegex = re.compile('<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S) -expeditionYearRegex = re.compile('<span\s+class="expeditionyear">(.*?)</span>', re.S) -titleRegex = re.compile('<H1>(.*?)</H1>', re.S) -reportRegex = re.compile('<div\s+class="report">(.*)</div>\s*</body>', re.S) -personRegex = re.compile('<div\s+class="person">(.*?)</div>', re.S) -nameAuthorRegex = re.compile('<span\s+class="name(,author|)">(.*?)</span>', re.S) -TURegex = re.compile('<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S) -locationRegex = re.compile('<span\s+class="location">(.*?)</span>', re.S) -caveRegex = re.compile('<span\s+class="cave">(.*?)</span>', re.S) + expos = models.Expedition.objects.all() + for expo in expos: + print("\nLoading Logbook for: " + expo.year) + + logbook_parseable = False + + if expo.year in yearlinks: + #print(yearlinks[expo.year]) + year_settings = yearlinks[expo.year] + file_in = open(os.path.join(expowebbase, year_settings[0])) + txt = file_in.read().decode("latin1") + file_in.close() + parsefunc = year_settings[1] + logbook_parseable = True + else: + try: + file_in = open(os.path.join(expowebbase, expo.year, settings.DEFAULT_LOGBOOK_FILE)) + txt = file_in.read().decode("latin1") + file_in.close() + logbook_parseable = True + print("No set parser found using default") + parsefunc = settings.DEFAULT_LOGBOOK_PARSER + except (IOError): + logbook_parseable = False + print("Couldn't open default logbook file and nothing in settings for expo " + expo.year) + + if logbook_parseable: + parser = globals()[parsefunc] + parser(expo.year, expo, txt) + SetDatesFromLogbookEntries(expo) + + +dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S) +expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S) +titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S) +reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S) +personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S) +nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S) +TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S) +locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S) +caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S) def parseAutoLogBookEntry(filename): errors = [] @@ -435,4 +458,4 @@ def parseAutoLogBookEntry(filename): time_underground = TU, logbook_entry = logbookEntry, is_logbook_entry_author = author).save() - print logbookEntry + print(logbookEntry) diff --git a/parsers/people.py b/parsers/people.py index bc18472..3c3fc03 100644 --- a/parsers/people.py +++ b/parsers/people.py @@ -50,7 +50,7 @@ def LoadPersonsExpos(): header = dict(zip(headers, range(len(headers)))) # make expeditions - print "Loading expeditions" + print("Loading expeditions") years = headers[5:] for year in years: @@ -61,7 +61,7 @@ def LoadPersonsExpos(): # make persons - print "Loading personexpeditions" + print("Loading personexpeditions") #expoers2008 = """Edvin Deadman,Kathryn Hopkins,Djuke Veldhuis,Becka Lawson,Julian Todd,Natalie Uomini,Aaron Curtis,Tony Rooke,Ollie Stevens,Frank Tully,Martin Jahnke,Mark Shinwell,Jess Stirrups,Nial Peters,Serena Povia,Olly Madge,Steve Jones,Pete Harley,Eeva Makiranta,Keith Curtis""".split(",") #expomissing = set(expoers2008) @@ -127,7 +127,7 @@ def GetPersonExpeditionNameLookup(expedition): res = { } duplicates = set() - print "Calculating GetPersonExpeditionNameLookup for", expedition.year + print("Calculating GetPersonExpeditionNameLookup for " + expedition.year) personexpeditions = models.PersonExpedition.objects.filter(expedition=expedition) for personexpedition in personexpeditions: possnames = [ ] |