summaryrefslogtreecommitdiffstats
path: root/parsers
diff options
context:
space:
mode:
authorsubstantialnoninfringinguser <substantialnoninfringinguser@gmail.com>2009-05-13 05:13:38 +0100
committersubstantialnoninfringinguser <substantialnoninfringinguser@gmail.com>2009-05-13 05:13:38 +0100
commitb503d3d588474cc41bffc01eca7654bb8c6f4a42 (patch)
tree782956fc07f18a13ae24fc0c045e970c6ba03f04 /parsers
downloadtroggle-b503d3d588474cc41bffc01eca7654bb8c6f4a42.tar.gz
troggle-b503d3d588474cc41bffc01eca7654bb8c6f4a42.tar.bz2
troggle-b503d3d588474cc41bffc01eca7654bb8c6f4a42.zip
[svn] Initial troggle checkin
This is a development site using Django 1.0 Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8034 by julian @ 10/26/2008 9:04 PM
Diffstat (limited to 'parsers')
-rw-r--r--parsers/__init__.py0
-rw-r--r--parsers/cavetab.py272
-rw-r--r--parsers/logbooks.py197
-rw-r--r--parsers/survex.py31
4 files changed, 500 insertions, 0 deletions
diff --git a/parsers/__init__.py b/parsers/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/parsers/__init__.py
diff --git a/parsers/cavetab.py b/parsers/cavetab.py
new file mode 100644
index 0000000..bd3d81b
--- /dev/null
+++ b/parsers/cavetab.py
@@ -0,0 +1,272 @@
+# -*- coding: utf-8 -*-
+
+import settings
+import expo.models as models
+import csv
+import time
+import sqlite3
+import re
+import os
+
##format of CAVETAB2.CSV is
# Column indices into each CAVETAB2.CSV row, in file order.  They are
# used as line[<ColumnName>] throughout the import loop below.
KatasterNumber = 0
KatStatusCode = 1
Entrances = 2
UnofficialNumber = 3
MultipleEntrances = 4
AutogenFile = 5
LinkFile = 6
LinkEntrance = 7
Name = 8
UnofficialName = 9
Comment = 10
Area = 11
Explorers = 12
UndergroundDescription = 13
Equipment = 14
QMList = 15
KatasterStatus = 16
References = 17
UndergroundCentreLine = 18
UndergroundDrawnSurvey = 19
SurvexFile = 20
Length = 21
Depth = 22
Extent = 23
Notes = 24
# Entrance-related columns follow.
EntranceName = 25
TagPoint = 26
OtherPoint = 27
DescriptionOfOtherPoint = 28
ExactEntrance = 29
TypeOfFix = 30
GPSpreSA = 31
GPSpostSA = 32
Northing = 33
Easting = 34
Altitude = 35
Bearings = 36
Map = 37
Location = 38
Approach = 39
EntranceDescription = 40
PhotoOfLocation = 41
Marking = 42
MarkingComment = 43
Findability = 44
FindabilityComment = 45
+
# Open the master cave table at import time; the reader is consumed by
# the module-level loop further down.
cavetab = open(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"))
caveReader = csv.reader(cavetab)
caveReader.next() # Strip out column headers
+
+
+def save(x): #There seems to be an intermitent problem with sqlite and Vista, this should fix it
+ try:
+ x.save()
+ except sqlite3.OperationalError:
+ print "Error"
+ time.sleep(1)
+ save(x)
+
def html_to_wiki(text):
    """Convert a fragment of CAVETAB2 html into wiki markup.

    Only <ul>/<ol>/<li> structures are converted: nested lists become
    runs of "*"/"#" prefixes; a leading </p> or trailing <p> is
    stripped.  Non-str input is returned unchanged.
    """
    if type(text) != str:
        return text
    text = unicode(text, "utf-8")
    # (A long block of commented-out html-entity substitutions lived
    # here; entities are currently passed through untouched.)
    #Lists
    text = re.sub("^</p>(.*)", r"\1", text)
    text = re.sub("(.*)<p>$", r"\1", text)
    out = ""
    lists = ""  # current nesting markers, e.g. "*#" = <ol> inside <ul>
    while text:
        mstar = re.match("^(.*?)<ul>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
        munstar = re.match("^(\s*)</ul>(.*)$", text, re.DOTALL)
        mhash = re.match("^(.*?)<ol>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
        munhash = re.match("^(\s*)</ol>(.*)$", text, re.DOTALL)
        mitem = re.match("^(\s*)<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
        ms = [len(m.groups()[0]) for m in [mstar, munstar, mhash, munhash, mitem] if m]
        def min_(i, l):
            # True when match i starts earliest in the remaining text.
            try:
                v = i.groups()[0]
                l.remove(len(v))
                # bug fix: this was min(l, 1000000000); under Python 2's
                # cross-type ordering min(list, int) is always the int,
                # so branch order -- not position in the text -- decided
                # which construct was consumed first.
                return len(v) < min(l + [1000000000])
            except:
                return False
        if min_(mstar, ms):
            lists += "*"
            pre, val, post = mstar.groups()
            out += pre + "\n" + lists + " " + val
            text = post
        elif min_(mhash, ms):
            lists += "#"
            pre, val, post = mhash.groups()
            out += pre + "\n" + lists + " " + val
            text = post
        elif min_(mitem, ms):
            pre, val, post = mitem.groups()
            out += "\n" + lists + " " + val
            text = post
        elif min_(munstar, ms):
            lists = lists[:-1]
            text = munstar.groups()[1]
        elif min_(munhash, ms):
            # bug fix: this was lists.pop(), but lists is a str (no
            # pop method); drop the last marker as the </ul> branch does.
            lists = lists[:-1]
            text = munhash.groups()[1]
        else:
            out += text
            text = ""
    # (A dead debugging loop that re-matched against `text` -- always ""
    # at this point -- was removed; its body could never execute.)
    return out
+
# Make sure the two top-level kataster areas exist, then keep handles
# to them for the import loop below.
for kat_area in ('1623', '1626'):
    if not models.Area.objects.filter(short_name = kat_area):
        save(models.Area(short_name = kat_area))
area1626 = models.Area.objects.filter(short_name = '1626')[0]
area1623 = models.Area.objects.filter(short_name = '1623')[0]
+
# Main import pass over CAVETAB2.CSV.  A row describes a cave, an extra
# entrance of the preceding cave, or both: MultipleEntrances == 'yes'
# starts a multi-entrance cave whose entrances follow on later rows;
# '' means a one-row cave with its single entrance.
counter=0  # NOTE(review): never used anywhere below
for line in caveReader :
    if line[Area] == 'nonexistent':
        continue
    entranceLetters=[] #Used in caves that have multiple entrances, which are not described on separate lines
    if line[MultipleEntrances] == 'yes' or line[MultipleEntrances]=='':
        args = {}
        # Copy a CSV field into the Cave constructor kwargs when non-empty.
        def addToArgs(CSVname, modelName):
            if line[CSVname]:
                args[modelName] = html_to_wiki(line[CSVname])
        addToArgs(KatasterNumber, "kataster_number")
        addToArgs(KatStatusCode, "kataster_code")
        addToArgs(UnofficialNumber, "unofficial_number")
        addToArgs(Name, "official_name")
        addToArgs(Comment, "notes")
        addToArgs(Explorers, "explorers")
        addToArgs(UndergroundDescription, "underground_description")
        addToArgs(Equipment, "equipment")
        addToArgs(KatasterStatus, "kataster_status")
        addToArgs(References, "references")
        addToArgs(UndergroundCentreLine, "underground_centre_line")
        addToArgs(UndergroundDrawnSurvey, "survey")
        addToArgs(Length, "length")
        addToArgs(Depth, "depth")
        addToArgs(Extent, "extent")
        addToArgs(SurvexFile, "survex_file")
        addToArgs(Notes, "notes")  # NOTE(review): overwrites the Comment -> "notes" mapping above when both columns are set -- confirm intended

        newCave = models.Cave(**args)
        save(newCave)

        # Attach the cave to its kataster area; unknown 1623 sub-areas
        # are created on demand with area1623 as parent.
        if line[Area]:
            if line[Area] == "1626":
                newCave.area.add(area1626)
            else:
                area = models.Area.objects.filter(short_name = line[Area])
                if area:
                    newArea = area[0]
                else:
                    newArea = models.Area(short_name = line[Area], parent = area1623)
                    save(newArea)
                newCave.area.add(newArea)
        else:
            newCave.area.add(area1623)

        save(newCave)

        if line[UnofficialName]:
            newUnofficialName = models.OtherCaveName(cave = newCave, name = line[UnofficialName])
            save(newUnofficialName)
    # Entrance rows: '' (single-entrance cave), 'entrance', 'last entrance'.
    if line[MultipleEntrances] == '' or \
       line[MultipleEntrances] == 'entrance' or \
       line[MultipleEntrances] == 'last entrance':
        args = {}
        def addToArgs(CSVname, modelName):
            if line[CSVname]:
                args[modelName] = html_to_wiki(line[CSVname])
        # Like addToArgs but maps the CSV value through a translation table.
        def addToArgsViaDict(CSVname, modelName, dictionary):
            if line[CSVname]:
                args[modelName] = dictionary[html_to_wiki(line[CSVname])]
        addToArgs(EntranceName, 'name')
        addToArgs(Explorers, 'explorers')
        addToArgs(Map, 'map_description')
        addToArgs(Location, 'location_description')
        addToArgs(Approach, 'approach')
        addToArgs(EntranceDescription, 'entrance_description')
        addToArgs(UndergroundDescription, 'underground_description')
        addToArgs(PhotoOfLocation, 'photo')
        addToArgsViaDict(Marking, 'marking', {"Paint": "P",
                                              "Paint (?)": "P?",
                                              "Tag": "T",
                                              "Tag (?)": "T?",
                                              "Retagged": "R",
                                              "Retag": "R",
                                              "Spit": "S",
                                              "Spit (?)": "S?",
                                              "Unmarked": "U",
                                              "": "?",
                                              })
        addToArgs(MarkingComment, 'marking_comment')
        addToArgsViaDict(Findability, 'findability', {"Surveyed": "S",
                                                      "Lost": "L",
                                                      "Refindable": "R",
                                                      "": "?",
                                                      "?": "?",
                                                      })
        addToArgs(FindabilityComment, 'findability_description')
        addToArgs(Easting, 'easting')
        addToArgs(Northing, 'northing')
        addToArgs(Altitude, 'alt')
        addToArgs(DescriptionOfOtherPoint, 'other_description')
        # Create a SurveyStation record for the named point and store it.
        def addToArgsSurveyStation(CSVname, modelName):
            if line[CSVname]:
                surveyPoint = models.SurveyStation(name = line[CSVname])
                save(surveyPoint)
                args[modelName] = html_to_wiki(surveyPoint)  # NOTE(review): passes the model object, not a string; html_to_wiki returns non-str input unchanged -- confirm intended
        addToArgsSurveyStation(TagPoint, 'tag_station')
        addToArgsSurveyStation(ExactEntrance, 'exact_station')
        addToArgsSurveyStation(OtherPoint, 'other_station')
        addToArgs(OtherPoint, 'other_description')
        if line[GPSpreSA]:
            addToArgsSurveyStation(GPSpreSA, 'other_station')
            args['other_description'] = 'pre selective availability GPS'
        if line[GPSpostSA]:
            addToArgsSurveyStation(GPSpostSA, 'other_station')
            args['other_description'] = 'post selective availability GPS'
        addToArgs(Bearings, 'bearings')
        newEntrance = models.Entrance(**args)
        save(newEntrance)

        if line[Entrances]:
            entrance_letter = line[Entrances]
        else:
            entrance_letter = ''

        # NOTE(review): newCave here may come from a previous iteration
        # (or be undefined on a malformed first row) -- confirm the CSV
        # always puts the cave row before its entrance rows.
        newCaveAndEntrance = models.CaveAndEntrance(cave = newCave, entrance = newEntrance, entrance_letter = entrance_letter)
        save(newCaveAndEntrance)
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
new file mode 100644
index 0000000..5c38d41
--- /dev/null
+++ b/parsers/logbooks.py
@@ -0,0 +1,197 @@
+#.-*- coding: utf-8 -*-
+
+import settings
+import expo.models as models
+import csv
+import sqlite3
+import re
+import os
+import datetime
+
+# Dave Johnson (Stonker) is hacked -- are there two of this DJ name
+# Dave Collins (Scout) is hacked
+# Letty ten Harkel has middle , tu = timeug or ""name removed
+# the <span lang=""sv""> have been removed
+# Dave Milne (Lummat)
+# Ben van Millingen
+# Rebecca Lawson (Becka)
+
# Load folk.csv at import time; `header` maps column name -> index and
# `headers` keeps the raw header row (year columns start at index 5).
persontab = open(os.path.join(settings.EXPOWEB, "noinfo", "folk.csv"))
personreader = csv.reader(persontab)
headers = personreader.next()
header = dict(zip(headers, range(len(headers))))
+
+
def LoadExpos():
    """Wipe all Expedition records and recreate one per year column of
    folk.csv, plus a hand-made 2008 expedition (not yet in the CSV)."""
    models.Expedition.objects.all().delete()
    y = models.Expedition(year = "2008", name = "CUCC expo2008")
    y.save()
    for year in headers[5:]:
        # bug fix: the name was built with "%s" % y -- interpolating the
        # previously created Expedition *object* rather than the year --
        # which produced garbage names; interpolate the year string.
        y = models.Expedition(year = year, name = "CUCC expo%s" % year)
        y.save()
+
def LoadPersons():
    """Wipe and re-create Person and PersonExpedition records from
    folk.csv; 2008 attendees missing from the CSV come from the
    hard-coded expoers2008 list."""
    models.Person.objects.all().delete()
    models.PersonExpedition.objects.all().delete()
    expoers2008 = """Edvin Deadman,Kathryn Hopkins,Djuke Veldhuis,Becka Lawson,Julian Todd,Natalie Uomini,Aaron Curtis,Tony Rooke,Ollie Stevens,Frank Tully,Martin Jahnke,Mark Shinwell,Jess Stirrups,Nial Peters,Serena Povia,Olly Madge,Steve Jones,Pete Harley,Eeva Makiranta,Keith Curtis""".split(",")
    expomissing = set(expoers2008)

    for person in personreader:
        name = person[header["Name"]]
        name = re.sub("<.*?>", "", name)  # strip html markup from the name
        lname = name.split()
        if len(lname) >= 2:
            firstname, lastname = lname[0], lname[1]
        else:
            firstname, lastname = lname[0], ""
        print firstname, lastname
        #assert lastname == person[header[""]], person
        pObject = models.Person(first_name = firstname,
                                last_name = lastname,
                                is_guest = person[header["Guest"]] == "1",
                                is_vfho = person[header["VfHO member"]],  # NOTE(review): raw CSV string, not a bool -- confirm the model coerces it
                                mug_shot = person[header["Mugshot"]])
        pObject.save()

        # Year columns start at index 5; "1" / "-1" both count as attended.
        for year, attended in zip(headers, person)[5:]:
            yo = models.Expedition.objects.filter(year = year)[0]
            if attended == "1" or attended == "-1":
                pyo = models.PersonExpedition(person = pObject, expedition = yo)
                pyo.save()

        if name in expoers2008:
            print "2008:", name
            expomissing.discard(name)
            yo = models.Expedition.objects.filter(year = "2008")[0]
            pyo = models.PersonExpedition(person = pObject, expedition = yo)
            pyo.save()


    # Anyone left in expomissing attended 2008 but has no folk.csv row.
    print expomissing
    for name in expomissing:
        firstname, lastname = name.split()
        pObject = models.Person(first_name = firstname,
                                last_name = lastname,
                                is_guest = name in ["Eeva Makiranta", "Kieth Curtis"],  # NOTE(review): "Kieth" looks like a typo -- the list above spells "Keith Curtis", so he is never flagged as guest; confirm
                                is_vfho = False,
                                mug_shot = "")
        pObject.save()
        yo = models.Expedition.objects.filter(year = "2008")[0]
        pyo = models.PersonExpedition(person = pObject, expedition = yo)
        pyo.save()
+
+
#
# the logbook loading section
#
def GetTripPersons(trippeople, expedition):
    """Resolve a logbook people string (e.g. "A, B & <u>C</u>") into
    PersonExpedition objects on the given expedition.

    Returns (persons, author): <u>...</u> marks the author; otherwise
    the last resolved person is used.  Names starting with '*' are
    skipped as non-member placeholders.
    """
    res = [ ]
    author = None
    for tripperson in re.split(",|\+|&| and ", trippeople):
        tripperson = tripperson.strip()
        mul = re.match("<u>(.*?)</u>$", tripperson)
        if mul:
            tripperson = mul.group(1)
        if tripperson and tripperson[0] != '*':
            #assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
            personyear = expedition.GetPersonExpedition(tripperson)
            print personyear
            res.append(personyear)
            if mul:
                author = personyear
    if not author:
        # NOTE(review): raises IndexError when no person was resolved --
        # confirm callers never pass an empty/all-'*' people string.
        author = res[-1]
    return res, author
+
def Parselogwikitxt(year, personyearmap, txt):
    """Parse a wiki-format logbook (trips delimited by
    ===date|place|people=== headers) and store a LogbookEntry plus its
    cavers for each trip.

    NOTE(review): personyearmap is forwarded to GetTripPersons as its
    `expedition` argument, so it must expose GetPersonExpedition -- confirm.
    """
    trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt)
    for triphead, triptext in trippara:
        tripheadp = triphead.split("|")
        assert len(tripheadp) == 3, tripheadp
        tripdate, tripplace, trippeople = tripheadp
        tripsplace = tripplace.split(" - ")
        tripcave = tripsplace[0]

        # Extract the T/U (time underground) figure from the trip text.
        tul = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
        if tul:
            #assert len(tul) <= 1, (triphead, triptext)
            #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
            triptime = tul[0][0]
        else:
            triptime = ""
        #assert tripcave == "Journey", (triphead, triptext)

        assert re.match("\d\d\d\d-\d\d-\d\d", tripdate), tripdate
        ldate = datetime.date(int(tripdate[:4]), int(tripdate[5:7]), int(tripdate[8:10]))
        lbo = models.LogbookEntry(date = ldate, cave = tripcave, title = tripsplace[-1], text = triptext, tu = triptime)
        lbo.save()

        trippersons, author = GetTripPersons(trippeople, personyearmap)
        for tripperson in trippersons:
            lbo.cavers.add(tripperson)
        # add the author
+ # add the author
+
def Parseloghtmltxt(year, expedition, txt):
    """Parse an html-format logbook (trips delimited by <hr/>, each with
    tripdate/trippeople/triptitle divs and an optional timeug div) and
    store LogbookEntry and PersonTrip records for the expedition."""
    tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        s = re.match('''(?x)\s*(?:<a\s+id="(.*?)"\s*/>)?
                 \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>
                 \s*<div\s+class="trippeople">(.*?)</div>
                 \s*<div\s+class="triptitle">(.*?)</div>
                 ([\s\S]*?)
                 \s*(?:<div\s+class="timeug">(.*?)</div>)?
                 \s*$
                 ''', trippara)
        assert s, trippara

        tripid, tripid1, tripdate, trippeople, triptitle, triptext, timeug = s.groups()
        # Dates appear either as ISO yyyy-mm-dd or as sloppy d/m/yy.
        mdatestandard = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
        mdategoof = re.match("(\d\d?)/(\d)/(\d\d)", tripdate)
        if mdatestandard:
            year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))  # NOTE: shadows the `year` parameter
        elif mdategoof:
            day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(3)) + 2000
        else:
            assert False, tripdate
        ldate = datetime.date(year, month, day)
        #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
        trippersons, author = GetTripPersons(trippeople, expedition)
        tripcave = ""
        lbo = models.LogbookEntry(date = ldate, place = tripcave, title = triptitle, text = triptext, author=author)
        lbo.save()
        tu = timeug or ""

        for tripperson in trippersons:
            pto = models.PersonTrip(personexpedition = tripperson, place=tripcave, date=ldate, timeunderground=tu, logbookentry=lbo)
            pto.save()
+
+
+
def LoadLogbooks():
    """Delete all logbook entries and re-import them from the per-year
    files in yearlinks (wiki .txt format for >= 2007, html otherwise)."""
    models.LogbookEntry.objects.all().delete()
    expowebbase = os.path.join(settings.EXPOWEB, "years") # this could be a url
    yearlinks = [
#        ("2008", "2008/logbook/2008logbook.txt"),
#        ("2007", "2007/logbook/2007logbook.txt"),
#        ("2005", "2005/logbook.html"),
        ("2004", "2004/logbook.html"),
#        ("2003", "2003/logbook.html"),
        ]

    for year, lloc in yearlinks:
        expedition = models.Expedition.objects.filter(year = year)[0]
        fin = open(os.path.join(expowebbase, lloc))
        txt = fin.read()
        fin.close()
        if year >= "2007":
            # bug fix: this branch referenced `personyearmap`, a name that
            # is never defined in this module (NameError as soon as a
            # >=2007 logbook is enabled); pass the expedition, which is
            # what GetTripPersons ultimately needs.
            Parselogwikitxt(year, expedition, txt)
        else:
            Parseloghtmltxt(year, expedition, txt)
+
# command line run through the loading stages
# (module import side effect: rebuilds expeditions, people and logbooks)
LoadExpos()
LoadPersons()
LoadLogbooks()
+
+
diff --git a/parsers/survex.py b/parsers/survex.py
new file mode 100644
index 0000000..0f75e06
--- /dev/null
+++ b/parsers/survex.py
@@ -0,0 +1,31 @@
+import settings
+import expo.models as models
+import re
+import os
+
+def readFile(filename):
+ for line in fileIterator(settings.SURVEX_DATA, filename):
+ print line
+
# Match "*include path.svx" / "*include path" directives; group 1 is the
# include path without the .svx extension.  The dot is escaped so only a
# literal ".svx" suffix matches -- the original bare "." matched any
# character (e.g. "*include fooXsvx" was accepted as "foo").
re_include_extension = re.compile(r"^\s*\*include\s+([^\s]*)\.svx$", re.IGNORECASE)
re_include_no_extension = re.compile(r"^\s*\*include\s+([^\s]*)$", re.IGNORECASE)
+
+def fileIterator(directory, filename):
+ f = open(os.path.join(directory, filename + ".svx"), "rb")
+ for line in f.readlines():
+ include_extension = re_include_extension.match(line)
+ include_no_extension = re_include_no_extension.match(line)
+ def a(include):
+ link = re.split(r"/|\\", include)
+ print os.path.join(directory, *link[:-1]), link[-1]
+ return fileIterator(os.path.join(directory, *link[:-1]), link[-1])
+ if include_extension:
+ for b in a(include_extension.groups()[0]):
+ yield b
+ elif include_no_extension:
+ for b in a(include_no_extension.groups()[0]):
+ yield b
+ else:
+ yield line
+
# Module import side effect: dump the whole dataset starting from all.svx.
readFile("all")