Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- | parsers/logbooks.py | 197 |
1 files changed, 197 insertions, 0 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
new file mode 100644
index 0000000..5c38d41
--- /dev/null
+++ b/parsers/logbooks.py
@@ -0,0 +1,197 @@
+# -*- coding: utf-8 -*-
+
+import settings
+import expo.models as models
+import csv
+import sqlite3
+import re
+import os
+import datetime
+
+# Dave Johnson (Stonker) is hacked -- are there two people with this name?
+# Dave Collins (Scout) is hacked
+# Letty ten Harkel has middle name removed
+# the <span lang="sv"> tags have been removed
+# Dave Milne (Lummat)
+# Ben van Millingen
+# Rebecca Lawson (Becka)
+
+persontab = open(os.path.join(settings.EXPOWEB, "noinfo", "folk.csv"))
+personreader = csv.reader(persontab)
+headers = personreader.next()
+header = dict(zip(headers, range(len(headers))))
+
+
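+# Wipe all Expedition objects, then create one for 2008 and one for each year column in folk.csv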
+def LoadExpos():
+ models.Expedition.objects.all().delete()
+ y = models.Expedition(year = "2008", name = "CUCC expo2008")
+ y.save()
+ for year in headers[5:]:
+ y = models.Expedition(year = year, name = "CUCC expo%s" % year)
+ y.save()
+
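+# Rebuild Person and PersonExpedition records from the folk.csv rows; 2008 attendees are listed explicitly in expoers2008 below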
+def LoadPersons():
+ models.Person.objects.all().delete()
+ models.PersonExpedition.objects.all().delete()
+ expoers2008 = """Edvin Deadman,Kathryn Hopkins,Djuke Veldhuis,Becka Lawson,Julian Todd,Natalie Uomini,Aaron Curtis,Tony Rooke,Ollie Stevens,Frank Tully,Martin Jahnke,Mark Shinwell,Jess Stirrups,Nial Peters,Serena Povia,Olly Madge,Steve Jones,Pete Harley,Eeva Makiranta,Keith Curtis""".split(",")
+ expomissing = set(expoers2008)
+
+ for person in personreader:
+ name = person[header["Name"]]
+ name = re.sub("<.*?>", "", name)
+ lname = name.split()
+ if len(lname) >= 2:
+ firstname, lastname = lname[0], lname[1]
+ else:
+ firstname, lastname = lname[0], ""
+ print firstname, lastname
+ #assert lastname == person[header[""]], person
+ pObject = models.Person(first_name = firstname,
+ last_name = lastname,
+ is_guest = person[header["Guest"]] == "1",
+ is_vfho = person[header["VfHO member"]],
+ mug_shot = person[header["Mugshot"]])
+ pObject.save()
+
+ for year, attended in zip(headers, person)[5:]:
+ yo = models.Expedition.objects.filter(year = year)[0]
+ if attended == "1" or attended == "-1":
+ pyo = models.PersonExpedition(person = pObject, expedition = yo)
+ pyo.save()
+
+ if name in expoers2008:
+ print "2008:", name
+ expomissing.discard(name)
+ yo = models.Expedition.objects.filter(year = "2008")[0]
+ pyo = models.PersonExpedition(person = pObject, expedition = yo)
+ pyo.save()
+
+
+ print expomissing
+ for name in expomissing:
+ firstname, lastname = name.split()
+ pObject = models.Person(first_name = firstname,
+ last_name = lastname,
+ is_guest = name in ["Eeva Makiranta", "Keith Curtis"],
+ is_vfho = False,
+ mug_shot = "")
+ pObject.save()
+ yo = models.Expedition.objects.filter(year = "2008")[0]
+ pyo = models.PersonExpedition(person = pObject, expedition = yo)
+ pyo.save()
+
+
+#
+# the logbook loading section
+#
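+# Split a trip-people string on commas, '+', '&' or ' and '; a name wrapped in <u>...</u> is taken as the author, and names starting with '*' are skipped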
+def GetTripPersons(trippeople, expedition):
+ res = [ ]
+ author = None
+ for tripperson in re.split(",|\+|&| and ", trippeople):
+ tripperson = tripperson.strip()
+ mul = re.match("<u>(.*?)</u>$", tripperson)
+ if mul:
+ tripperson = mul.group(1)
+ if tripperson and tripperson[0] != '*':
+ #assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
+ personyear = expedition.GetPersonExpedition(tripperson)
+ print personyear
+ res.append(personyear)
+ if mul:
+ author = personyear
+ if not author:
+ author = res[-1]
+ return res, author
+
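+# Parse wiki-style logbook text: each trip starts with a ===date|place|people=== header followed by the trip text and an optional "T/U: n hrs" note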
+def Parselogwikitxt(year, expedition, txt):
+ trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt)
+ for triphead, triptext in trippara:
+ tripheadp = triphead.split("|")
+ assert len(tripheadp) == 3, tripheadp
+ tripdate, tripplace, trippeople = tripheadp
+ tripsplace = tripplace.split(" - ")
+ tripcave = tripsplace[0]
+
+ tul = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
+ if tul:
+ #assert len(tul) <= 1, (triphead, triptext)
+ #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
+ triptime = tul[0][0]
+ else:
+ triptime = ""
+ #assert tripcave == "Journey", (triphead, triptext)
+
+ assert re.match("\d\d\d\d-\d\d-\d\d", tripdate), tripdate
+ ldate = datetime.date(int(tripdate[:4]), int(tripdate[5:7]), int(tripdate[8:10]))
+ lbo = models.LogbookEntry(date = ldate, cave = tripcave, title = tripsplace[-1], text = triptext, tu = triptime)
+ lbo.save()
+
+ trippersons, author = GetTripPersons(trippeople, expedition)
+ for tripperson in trippersons:
+ lbo.cavers.add(tripperson)
+ # add the author
+
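+# Parse html logbook text: trips are separated by <hr/> and carry tripdate, trippeople and triptitle divs plus an optional timeug div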
+def Parseloghtmltxt(year, expedition, txt):
+ tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
+ for trippara in tripparas:
+ s = re.match('''(?x)\s*(?:<a\s+id="(.*?)"\s*/>)?
+ \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>
+ \s*<div\s+class="trippeople">(.*?)</div>
+ \s*<div\s+class="triptitle">(.*?)</div>
+ ([\s\S]*?)
+ \s*(?:<div\s+class="timeug">(.*?)</div>)?
+ \s*$
+ ''', trippara)
+ assert s, trippara
+
+ tripid, tripid1, tripdate, trippeople, triptitle, triptext, timeug = s.groups()
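+ # trip dates appear either as yyyy-mm-dd or as d/m/yy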
+ mdatestandard = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
+ mdategoof = re.match("(\d\d?)/(\d)/(\d\d)", tripdate)
+ if mdatestandard:
+ year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
+ elif mdategoof:
+ day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(3)) + 2000
+ else:
+ assert False, tripdate
+ ldate = datetime.date(year, month, day)
+ #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
+ trippersons, author = GetTripPersons(trippeople, expedition)
+ tripcave = ""
+ lbo = models.LogbookEntry(date = ldate, place = tripcave, title = triptitle, text = triptext, author=author)
+ lbo.save()
+ tu = timeug or ""
+
+ for tripperson in trippersons:
+ pto = models.PersonTrip(personexpedition = tripperson, place=tripcave, date=ldate, timeunderground=tu, logbookentry=lbo)
+ pto.save()
+
+
+
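+# Delete all logbook entries and reload them from the year files listed below; logbooks from 2007 onwards are wiki text, earlier ones are html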
+def LoadLogbooks():
+ models.LogbookEntry.objects.all().delete()
+ expowebbase = os.path.join(settings.EXPOWEB, "years") # this could be a url
+ yearlinks = [
+# ("2008", "2008/logbook/2008logbook.txt"),
+# ("2007", "2007/logbook/2007logbook.txt"),
+# ("2005", "2005/logbook.html"),
+ ("2004", "2004/logbook.html"),
+# ("2003", "2003/logbook.html"),
+ ]
+
+ for year, lloc in yearlinks:
+ expedition = models.Expedition.objects.filter(year = year)[0]
+ fin = open(os.path.join(expowebbase, lloc))
+ txt = fin.read()
+ fin.close()
+ #print expedition
+ if year >= "2007":
+ Parselogwikitxt(year, expedition, txt)
+ else:
+ Parseloghtmltxt(year, expedition, txt)
+
+# run through the loading stages when invoked from the command line
+LoadExpos()
+LoadPersons()
+LoadLogbooks()
+
+