diff options
Diffstat (limited to 'parsers')
-rw-r--r-- | parsers/logbooks.py | 174 |
1 files changed, 88 insertions, 86 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py index c9e1651..900022f 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -30,15 +30,15 @@ def LoadPersons(): for person in personreader:
name = person[header["Name"]]
name = re.sub("<.*?>", "", name)
- mname = re.match("(\w+)(?:\s((?:van |ten )?\w+))?(?:\s\(([^)]*)\))?", name) + mname = re.match("(\w+)(?:\s((?:van |ten )?\w+))?(?:\s\(([^)]*)\))?", name)
if mname.group(3):
nickname = mname.group(3)
else:
nickname = ""
- firstname, lastname = mname.group(1), mname.group(2) or "" - + firstname, lastname = mname.group(1), mname.group(2) or ""
+
#print firstname, lastname, "NNN", nickname
#assert lastname == person[header[""]], person
@@ -53,14 +53,14 @@ def LoadPersons(): yo = models.Expedition.objects.filter(year = year)[0]
if attended == "1" or attended == "-1":
pyo = models.PersonExpedition(person = pObject, expedition = yo, nickname=nickname, is_guest=is_guest)
- pyo.save() - - # error - elif (firstname, lastname) == ("Mike", "Richardson") and year == "2001": - print "Mike Richardson(2001) error" + pyo.save()
+
+ # error
+ elif (firstname, lastname) == ("Mike", "Richardson") and year == "2001":
+ print "Mike Richardson(2001) error"
pyo = models.PersonExpedition(person = pObject, expedition = yo, nickname=nickname, is_guest=is_guest)
- pyo.save() - + pyo.save()
+
if name in expoers2008:
print "2008:", name
@@ -98,8 +98,8 @@ def GetTripPersons(trippeople, expedition): if tripperson and tripperson[0] != '*':
#assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
personyear = expedition.GetPersonExpedition(tripperson)
- if not personyear: - print "NoMatchFor: '%s'" % tripperson + if not personyear:
+ print "NoMatchFor: '%s'" % tripperson
res.append(personyear)
if mul:
author = personyear
@@ -107,31 +107,31 @@ def GetTripPersons(trippeople, expedition): author = res[-1]
return res, author
-def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, tu): +def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, tu):
trippersons, author = GetTripPersons(trippeople, expedition)
lbo = models.LogbookEntry(date=date, place=place, title=title[:50], text=text, author=author)
- lbo.save() + lbo.save()
print "ttt", date, place
for tripperson in trippersons:
- pto = models.PersonTrip(personexpedition = tripperson, place=place, date=date, timeunderground=(tu or ""),
- logbookentry=lbo, is_logbookentryauthor=(tripperson == author))
+ pto = models.PersonTrip(person_expedition = tripperson, place=place, date=date, time_underground=(tu or ""),
+ logbook_entry=lbo, is_logbook_entry_author=(tripperson == author))
pto.save()
- -def ParseDate(tripdate, year): +
def ParseDate(tripdate, year):
    """Turn the date text of a logbook entry into a ``datetime.date``.

    Only the start of *tripdate* is examined (``re.match``), so trailing text
    is ignored.  Two layouts are understood:

    * ``YYYY-MM-DD`` -- the four-digit year must equal *year*.
    * ``D/M/YY`` or ``D/M/YYYY`` (day first, leading zeros optional) -- the
      century is always taken from *year*; if the text spells out a century
      (``19`` or ``20``) it must agree with the first two characters of *year*.

    tripdate -- the date string to parse
    year     -- expedition year as a four-character string such as "2008";
                it is compared and sliced as text, so an int will not work

    Returns a ``datetime.date``.

    Raises AssertionError when the text fits neither layout or disagrees with
    *year*.  The error is raised explicitly instead of through ``assert`` so
    the checks are still performed under ``python -O``.  ``datetime.date``
    itself raises ValueError for impossible values such as month 13.
    """
    mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
    if mdatestandard:
        if mdatestandard.group(1) != year:
            raise AssertionError((tripdate, year))
        return datetime.date(
            int(mdatestandard.group(1)),
            int(mdatestandard.group(2)),
            int(mdatestandard.group(3)),
        )

    # The month takes one or two digits, so October-December dates are
    # accepted as well as zero-padded ones such as "07".
    mdategoof = re.match(r"(\d\d?)/(\d\d?)/(20|19)?(\d\d)", tripdate)
    if mdategoof:
        century = mdategoof.group(3)
        if century and century != year[:2]:
            raise AssertionError((tripdate, year))
        yadd = int(year[:2]) * 100
        return datetime.date(
            int(mdategoof.group(4)) + yadd,
            int(mdategoof.group(2)),
            int(mdategoof.group(1)),
        )

    raise AssertionError(tripdate)
- -# 2007, 2008, 2006 +
+# 2007, 2008, 2006
def Parselogwikitxt(year, expedition, txt):
trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt)
for triphead, triptext in trippara:
@@ -150,11 +150,11 @@ def Parselogwikitxt(year, expedition, txt): tu = ""
#assert tripcave == "Journey", (triphead, triptext)
- ldate = ParseDate(tripdate.strip(), year) + ldate = ParseDate(tripdate.strip(), year)
#print "\n", tripcave, "--- ppp", trippeople, len(triptext)
- EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, tu=tu) + EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, tu=tu)
-# 2002, 2004, 2005 +# 2002, 2004, 2005
def Parseloghtmltxt(year, expedition, txt):
tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
for trippara in tripparas:
@@ -169,85 +169,86 @@ def Parseloghtmltxt(year, expedition, txt): assert s, trippara
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
- ldate = ParseDate(tripdate.strip(), year) - #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate) - trippeople = re.sub("Ol(?!l)", "Olly", trippeople) - trippeople = re.sub("Wook(?!e)", "Wookey", trippeople) - triptitles = triptitle.split(" - ") - if len(triptitles) >= 2: - tripcave = triptitles[0] - else: + ldate = ParseDate(tripdate.strip(), year)
+ #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
+ trippeople = re.sub("Ol(?!l)", "Olly", trippeople)
+ trippeople = re.sub("Wook(?!e)", "Wookey", trippeople)
+ triptitles = triptitle.split(" - ")
+ if len(triptitles) >= 2:
+ tripcave = triptitles[0]
+ else:
tripcave = "UNKNOWN"
#print "\n", tripcave, "--- ppp", trippeople, len(triptext)
ltriptext = re.sub("</p>", "", triptext)
ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, tu=tu) + ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
+ EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, tu=tu)
+
- -# main parser for pre-2001. simpler because the data has been hacked so much to fit it +# main parser for pre-2001. simpler because the data has been hacked so much to fit it
def Parseloghtml01(year, expedition, txt):
    """Parse an HTML logbook split by <hr> rules and store every entry.

    Each chunk between one <hr> tag and the next is one trip.  The text up to
    the first <p> or </p> tag (matched case-insensitively) is the header and
    must consist of exactly three "|"-separated fields: date, title, people.
    Everything after that tag is the trip write-up.

    year        -- expedition year, passed to ParseDate together with the header date
    expedition  -- expedition object, passed through to EnterLogIntoDbase
    txt         -- full logbook text

    Raises AssertionError when a chunk has no paragraph tag at all (raised
    explicitly so the check survives ``python -O``) and ValueError when the
    header does not split into exactly three fields.
    """
    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        # Inline flags must lead the pattern: Python 3.11+ rejects a global
        # flag such as (?i) placed anywhere else.
        s = re.match(u"(?is)\\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
        if not s:
            raise AssertionError(trippara[:100])
        tripheader, triptext = s.group(1), s.group(2)

        # anchor id of the entry; extracted but not stored anywhere yet
        mtripid = re.search('<a id="(.*?)"', tripheader)
        tripid = mtripid.group(1) if mtripid else ""

        # drop <a>, <b> and <span> tags from the header before splitting it
        tripheader = re.sub("</?(?:[ab]|span)[^>]*>", "", tripheader)

        #print [tripheader]
        #continue

        tripdate, triptitle, trippeople = tripheader.split("|")
        ldate = ParseDate(tripdate.strip(), year)

        # A paragraph starting with "T/U" or "TU" carries the time underground;
        # cut it out of the write-up ("." stops at the end of the line).
        mtu = re.search('<p[^>]*>(T/?U.*)', triptext)
        if mtu:
            tu = mtu.group(1)
            triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
        else:
            tu = ""

        # the place is whatever precedes the first " - " in the title
        triptitles = triptitle.split(" - ")
        tripcave = triptitles[0].strip()

        # flatten the markup: line breaks become spaces, <p> becomes a blank line
        ltriptext = re.sub("</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
        #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)

        #print ldate, trippeople.strip()
        # could include the tripid (url link for cross referencing)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, tu=tu)
+
def Parseloghtml03(year, expedition, txt):
tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
for trippara in tripparas:
s = re.match(u"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
- assert s, trippara + assert s, trippara
tripheader, triptext = s.group(1), s.group(2)
- tripheader = re.sub(" ", " ", tripheader) - tripheader = re.sub("\s+", " ", tripheader).strip() - sheader = tripheader.split(" -- ") - tu = "" - if re.match("T/U|Time underwater", sheader[-1]): - tu = sheader.pop() - if len(sheader) != 3: - print sheader - # continue - tripdate, triptitle, trippeople = sheader - ldate = ParseDate(tripdate.strip(), year) - triptitles = triptitle.split(" , ") - if len(triptitles) >= 2: - tripcave = triptitles[0] - else: + tripheader = re.sub(" ", " ", tripheader)
+ tripheader = re.sub("\s+", " ", tripheader).strip()
+ sheader = tripheader.split(" -- ")
+ tu = ""
+ if re.match("T/U|Time underwater", sheader[-1]):
+ tu = sheader.pop()
+ if len(sheader) != 3:
+ print sheader
+ # continue
+ tripdate, triptitle, trippeople = sheader
+ ldate = ParseDate(tripdate.strip(), year)
+ triptitles = triptitle.split(" , ")
+ if len(triptitles) >= 2:
+ tripcave = triptitles[0]
+ else:
tripcave = "UNKNOWN"
#print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
ltriptext = re.sub("</p>", "", triptext)
ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub("<p>", "\n\n", ltriptext).strip() - ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext) - EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, tu=tu) + ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
+ ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
+ EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, tu=tu)
def LoadLogbooks():
models.LogbookEntry.objects.all().delete()
@@ -272,12 +273,13 @@ def LoadLogbooks(): expedition = models.Expedition.objects.filter(year = year)[0]
fin = open(os.path.join(expowebbase, lloc))
txt = fin.read()
- fin.close() - parsefunc(year, expedition, txt) + fin.close()
+ parsefunc(year, expedition, txt)
# command line run through the loading stages
# you can comment out these in turn to control what gets reloaded
-LoadExpos()
LoadPersons()
+LoadExpos()
+LoadPersons()
LoadLogbooks()
|