path: root/parsers
author     Wookey <wookey@wookware.org>    2011-07-11 23:28:23 +0100
committer  Wookey <wookey@wookware.org>    2011-07-11 23:28:23 +0100
commit     ded3d58da16a609ce49fa393b70a93acd22a9d1e (patch)
tree       24de35f27ab4783629bee9a8424540cecd01b728 /parsers
parent     3b028661f627227d7325c65adc134c3831e854d3 (diff)
parent     b6a1503c7a00a582fa08cb5cfb97490f8bfa07aa (diff)
download   troggle-ded3d58da16a609ce49fa393b70a93acd22a9d1e.tar.gz
           troggle-ded3d58da16a609ce49fa393b70a93acd22a9d1e.tar.bz2
           troggle-ded3d58da16a609ce49fa393b70a93acd22a9d1e.zip
rest of martin's changes, without reverting lineend issues
Diffstat (limited to 'parsers')
-rw-r--r--  parsers/cavetab.py       507
-rw-r--r--  parsers/descriptions.py   88
-rw-r--r--  parsers/logbooks.py      864
-rw-r--r--  parsers/subcaves.py      112
-rw-r--r--  parsers/surveys.py       602
5 files changed, 1086 insertions(+), 1087 deletions(-)
diff --git a/parsers/cavetab.py b/parsers/cavetab.py
index 814b3a0..d76a280 100644
--- a/parsers/cavetab.py
+++ b/parsers/cavetab.py
@@ -1,254 +1,253 @@
-# -*- coding: utf-8 -*-
-import troggle.core.models as models
-from django.conf import settings
-import csv, time, re, os, logging
-from utils import save_carefully
-from django.core.urlresolvers import reverse
-import flatpages.models
-
-##format of CAVETAB2.CSV is
-KatasterNumber = 0
-KatStatusCode = 1
-Entrances = 2
-UnofficialNumber = 3
-MultipleEntrances = 4
-AutogenFile = 5
-LinkFile = 6
-LinkEntrance = 7
-Name = 8
-UnofficialName = 9
-Comment = 10
-Area = 11
-Explorers = 12
-UndergroundDescription = 13
-Equipment = 14
-QMList = 15
-KatasterStatus = 16
-References = 17
-UndergroundCentreLine = 18
-UndergroundDrawnSurvey = 19
-SurvexFile = 20
-Length = 21
-Depth = 22
-Extent = 23
-Notes = 24
-EntranceName = 25
-TagPoint = 26
-OtherPoint = 27
-DescriptionOfOtherPoint = 28
-ExactEntrance = 29
-TypeOfFix = 30
-GPSpreSA = 31
-GPSpostSA = 32
-Northing = 33
-Easting = 34
-Altitude = 35
-Bearings = 36
-Map = 37
-Location = 38
-Approach = 39
-EntranceDescription = 40
-PhotoOfLocation = 41
-Marking = 42
-MarkingComment = 43
-Findability = 44
-FindabilityComment = 45
-
-def LoadCaveTab():
-
- cavetab = open(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'rU')
- caveReader = csv.reader(cavetab)
- caveReader.next() # Strip out column headers
-
- logging.info("Beginning to import caves from "+str(cavetab)+"\n"+"-"*60+"\n")
-
- for katArea in ['1623', '1626']:
- if not models.Area.objects.filter(short_name = katArea):
- newArea = models.Area(short_name = katArea)
- newArea.save()
- logging.info("Added area "+str(newArea.short_name)+"\n")
- area1626 = models.Area.objects.filter(short_name = '1626')[0]
- area1623 = models.Area.objects.filter(short_name = '1623')[0]
-
- counter=0
- for line in caveReader :
- if line[Area] == 'nonexistent':
- continue
- entranceLetters=[] #Used in caves that have mulitlple entrances, which are not described on seperate lines
- if line[MultipleEntrances] == 'yes' or line[MultipleEntrances]=='': #When true, this line contains an actual cave, otherwise it is an extra entrance.
- args = {}
- defaultArgs = {}
-
- def addToArgs(CSVname, modelName):
- if line[CSVname]:
- args[modelName] = line[CSVname]
-
- def addToDefaultArgs(CSVname, modelName): #This has to do with the non-destructive import. These arguments will be passed as the "default" dictionary in a get_or_create
- if line[CSVname]:
- defaultArgs[modelName] = line[CSVname]
-
- # The attributes added using "addToArgs" will be used to look up an existing cave. Those added using "addToDefaultArgs" will not.
- addToArgs(KatasterNumber, "kataster_number")
- addToDefaultArgs(KatStatusCode, "kataster_code")
- addToArgs(UnofficialNumber, "unofficial_number")
- addToArgs(Name, "official_name")
- addToDefaultArgs(Comment, "notes")
- addToDefaultArgs(Explorers, "explorers")
- addToDefaultArgs(UndergroundDescription, "underground_description")
- addToDefaultArgs(Equipment, "equipment")
- addToDefaultArgs(KatasterStatus, "kataster_status")
- addToDefaultArgs(References, "references")
- addToDefaultArgs(UndergroundCentreLine, "underground_centre_line")
- addToDefaultArgs(UndergroundDrawnSurvey, "survey")
- addToDefaultArgs(Length, "length")
- addToDefaultArgs(Depth, "depth")
- addToDefaultArgs(Extent, "extent")
- addToDefaultArgs(SurvexFile, "survex_file")
- addToDefaultArgs(Notes, "notes")
- addToDefaultArgs(AutogenFile, "url")
- if line[Area] == "1626":
- if line[KatasterNumber] != "":
- args["slug"] = line[Area] + "-" + line[KatasterNumber]
- else:
- args["slug"] = line[Area] + "-" + line[UnofficialNumber]
- else:
- if line[KatasterNumber] != "":
- args["slug"] = "1623" + "-" + line[KatasterNumber]
- else:
- args["slug"] = "1623" + "-" + line[UnofficialNumber]
- #The following adds the legacy_file_path. This is always in either Autogen file or Link file
- for header in (AutogenFile,LinkFile):
- if line[header]:
- addToDefaultArgs(header,"description_file")
- break
-
-
- #The following checks if this cave is non-public i.e. we don't have rights to display it online.
- #Noinfo was the name of the old password protected directory, so if it has that then we will
- #set the non_public field of the model instance to true.
- defaultArgs["non_public"]=line[AutogenFile].startswith('noinfo') or line[LinkFile].startswith('noinfo')
-
- newCave, created=save_carefully(models.Cave, lookupAttribs=args, nonLookupAttribs=defaultArgs)
- logging.info("Added cave "+str(newCave)+"\n")
-
- #If we created a new cave, add the area to it. This does mean that if a cave's identifying features have not changed, areas will not be updated from csv.
- if created and line[Area]:
- if line[Area] == "1626":
- newCave.area.add(area1626)
- else:
- area = models.Area.objects.filter(short_name = line[Area])
- if area:
- newArea = area[0]
- else:
- newArea = models.Area(short_name = line[Area], parent = area1623)
- newArea.save()
- newCave.area.add(newArea)
- newCave.area.add(area1623)
- elif created:
- newCave.area.add(area1623)
-
- newCave.save()
-
- logging.info("Added area "+line[Area]+" to cave "+str(newCave)+"\n")
-
- if created and line[UnofficialName]:
- newUnofficialName = models.OtherCaveName(cave = newCave, name = line[UnofficialName])
- newUnofficialName.save()
-
- logging.info("Added unofficial name "+str(newUnofficialName)+" to cave "+str(newCave)+"\n")
-
-
- if created and line[MultipleEntrances] == '' or \
- line[MultipleEntrances] == 'entrance' or \
- line[MultipleEntrances] == 'last entrance':
- args = {}
-
- if line[Entrances]:
- entrance_letter = line[Entrances]
- else:
- entrance_letter = ''
-
- def addToArgs(CSVname, modelName):
- if line[CSVname]:
- args[modelName] = line[CSVname]
- def addToArgsViaDict(CSVname, modelName, dictionary):
- if line[CSVname]:
- args[modelName] = dictionary[line[CSVname]]
- addToArgs(EntranceName, 'name')
- addToArgs(Explorers, 'explorers')
- addToArgs(Map, 'map_description')
- addToArgs(Location, 'location_description')
- addToArgs(Approach, 'approach')
- addToArgs(EntranceDescription, 'entrance_description')
- addToArgs(UndergroundDescription, 'underground_description')
- addToArgs(PhotoOfLocation, 'photo')
- addToArgsViaDict(Marking, 'marking', {"Paint": "P",
- "Paint (?)": "P?",
- "Tag": "T",
- "Tag (?)": "T?",
- "Retagged": "R",
- "Retag": "R",
- "Spit": "S",
- "Spit (?)": "S?",
- "Unmarked": "U",
- "": "?",
- })
-
- addToArgs(MarkingComment, 'marking_comment')
- addToArgsViaDict(Findability, 'findability', {"Surveyed": "S",
- "Lost": "L",
- "Refindable": "R",
- "": "?",
- "?": "?",
- })
- addToArgs(FindabilityComment, 'findability_description')
- addToArgs(Easting, 'easting')
- addToArgs(Northing, 'northing')
- addToArgs(Altitude, 'alt')
- addToArgs(DescriptionOfOtherPoint, 'other_description')
- addToArgs(TagPoint, 'tag_station')
- addToArgs(ExactEntrance, 'exact_station')
- addToArgs(OtherPoint, 'other_station')
- addToArgs(OtherPoint, 'other_description')
- if line[GPSpreSA]:
- addToArgs(GPSpreSA, 'other_station')
- args['other_description'] = 'pre selective availability GPS'
- if line[GPSpostSA]:
- addToArgs(GPSpostSA, 'other_station')
- args['other_description'] = 'post selective availability GPS'
- addToArgs(Bearings, 'bearings')
- args['slug'] = newCave.slug + entrance_letter
- newEntrance = models.Entrance(**args)
- newEntrance.save()
-
- logging.info("Added entrance "+str(newEntrance)+"\n")
-
-
- newCaveAndEntrance = models.CaveAndEntrance(cave = newCave, entrance = newEntrance, entrance_letter = entrance_letter)
- newCaveAndEntrance.save()
-
- logging.info("Added CaveAndEntrance "+str(newCaveAndEntrance)+"\n")
- if line[AutogenFile] != "":
- f = flatpages.models.EntranceRedirect(originalURL = line[AutogenFile], entrance = newEntrance)
- f.save()
-
-
-# lookup function modelled on GetPersonExpeditionNameLookup
-Gcavelookup = None
-def GetCaveLookup():
- global Gcavelookup
- if Gcavelookup:
- return Gcavelookup
- Gcavelookup = {"NONEPLACEHOLDER":None}
- for cave in models.Cave.objects.all():
- Gcavelookup[cave.official_name.lower()] = cave
- if cave.kataster_number:
- Gcavelookup[cave.kataster_number] = cave
- if cave.unofficial_number:
- Gcavelookup[cave.unofficial_number] = cave
-
- Gcavelookup["tunnocks"] = Gcavelookup["258"]
- Gcavelookup["hauchhole"] = Gcavelookup["234"]
- return Gcavelookup
-
-
+# -*- coding: utf-8 -*-
+import troggle.core.models as models
+from django.conf import settings
+import csv, time, re, os, logging
+from utils import save_carefully
+from django.core.urlresolvers import reverse
+import flatpages.models
+
+##format of CAVETAB2.CSV is
+KatasterNumber = 0
+KatStatusCode = 1
+Entrances = 2
+UnofficialNumber = 3
+MultipleEntrances = 4
+AutogenFile = 5
+LinkFile = 6
+LinkEntrance = 7
+Name = 8
+UnofficialName = 9
+Comment = 10
+Area = 11
+Explorers = 12
+UndergroundDescription = 13
+Equipment = 14
+QMList = 15
+KatasterStatus = 16
+References = 17
+UndergroundCentreLine = 18
+UndergroundDrawnSurvey = 19
+SurvexFile = 20
+Length = 21
+Depth = 22
+Extent = 23
+Notes = 24
+EntranceName = 25
+TagPoint = 26
+OtherPoint = 27
+DescriptionOfOtherPoint = 28
+ExactEntrance = 29
+TypeOfFix = 30
+GPSpreSA = 31
+GPSpostSA = 32
+Northing = 33
+Easting = 34
+Altitude = 35
+Bearings = 36
+Map = 37
+Location = 38
+Approach = 39
+EntranceDescription = 40
+PhotoOfLocation = 41
+Marking = 42
+MarkingComment = 43
+Findability = 44
+FindabilityComment = 45
+
+def LoadCaveTab():
+
+ cavetab = open(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'rU')
+ caveReader = csv.reader(cavetab)
+ caveReader.next() # Strip out column headers
+
+ logging.info("Beginning to import caves from "+str(cavetab)+"\n"+"-"*60+"\n")
+
+ for katArea in ['1623', '1626']:
+ if not models.Area.objects.filter(short_name = katArea):
+ newArea = models.Area(short_name = katArea)
+ newArea.save()
+ logging.info("Added area "+str(newArea.short_name)+"\n")
+ area1626 = models.Area.objects.filter(short_name = '1626')[0]
+ area1623 = models.Area.objects.filter(short_name = '1623')[0]
+
+ counter=0
+ for line in caveReader :
+ if line[Area] == 'nonexistent':
+ continue
+ entranceLetters=[] #Used in caves that have mulitlple entrances, which are not described on seperate lines
+ if line[MultipleEntrances] == 'yes' or line[MultipleEntrances]=='': #When true, this line contains an actual cave, otherwise it is an extra entrance.
+ args = {}
+ defaultArgs = {}
+
+ def addToArgs(CSVname, modelName):
+ if line[CSVname]:
+ args[modelName] = line[CSVname]
+
+ def addToDefaultArgs(CSVname, modelName): #This has to do with the non-destructive import. These arguments will be passed as the "default" dictionary in a get_or_create
+ if line[CSVname]:
+ defaultArgs[modelName] = line[CSVname]
+
+ # The attributes added using "addToArgs" will be used to look up an existing cave. Those added using "addToDefaultArgs" will not.
+ addToArgs(KatasterNumber, "kataster_number")
+ addToDefaultArgs(KatStatusCode, "kataster_code")
+ addToArgs(UnofficialNumber, "unofficial_number")
+ addToArgs(Name, "official_name")
+ addToDefaultArgs(Comment, "notes")
+ addToDefaultArgs(Explorers, "explorers")
+ addToDefaultArgs(UndergroundDescription, "underground_description")
+ addToDefaultArgs(Equipment, "equipment")
+ addToDefaultArgs(KatasterStatus, "kataster_status")
+ addToDefaultArgs(References, "references")
+ addToDefaultArgs(UndergroundCentreLine, "underground_centre_line")
+ addToDefaultArgs(UndergroundDrawnSurvey, "survey")
+ addToDefaultArgs(Length, "length")
+ addToDefaultArgs(Depth, "depth")
+ addToDefaultArgs(Extent, "extent")
+ addToDefaultArgs(SurvexFile, "survex_file")
+ addToDefaultArgs(Notes, "notes")
+ addToDefaultArgs(AutogenFile, "url")
+ if line[Area] == "1626":
+ if line[KatasterNumber] != "":
+ args["slug"] = line[Area] + "-" + line[KatasterNumber]
+ else:
+ args["slug"] = line[Area] + "-" + line[UnofficialNumber]
+ else:
+ if line[KatasterNumber] != "":
+ args["slug"] = "1623" + "-" + line[KatasterNumber]
+ else:
+ args["slug"] = "1623" + "-" + line[UnofficialNumber]
+ #The following adds the legacy_file_path. This is always in either Autogen file or Link file
+ for header in (AutogenFile,LinkFile):
+ if line[header]:
+ addToDefaultArgs(header,"description_file")
+ break
+
+ #The following checks if this cave is non-public i.e. we don't have rights to display it online.
+ #Noinfo was the name of the old password protected directory, so if it has that then we will
+ #set the non_public field of the model instance to true.
+ defaultArgs["non_public"]=line[AutogenFile].startswith('noinfo') or line[LinkFile].startswith('noinfo')
+
+ newCave, created=save_carefully(models.Cave, lookupAttribs=args, nonLookupAttribs=defaultArgs)
+ logging.info("Added cave "+str(newCave)+"\n")
+
+ #If we created a new cave, add the area to it. This does mean that if a cave's identifying features have not changed, areas will not be updated from csv.
+ if created and line[Area]:
+ if line[Area] == "1626":
+ newCave.area.add(area1626)
+ else:
+ area = models.Area.objects.filter(short_name = line[Area])
+ if area:
+ newArea = area[0]
+ else:
+ newArea = models.Area(short_name = line[Area], parent = area1623)
+ newArea.save()
+ newCave.area.add(newArea)
+ newCave.area.add(area1623)
+ elif created:
+ newCave.area.add(area1623)
+
+ newCave.save()
+
+ logging.info("Added area "+line[Area]+" to cave "+str(newCave)+"\n")
+
+ if created and line[UnofficialName]:
+ newUnofficialName = models.OtherCaveName(cave = newCave, name = line[UnofficialName])
+ newUnofficialName.save()
+
+ logging.info("Added unofficial name "+str(newUnofficialName)+" to cave "+str(newCave)+"\n")
+
+
+ if created and line[MultipleEntrances] == '' or \
+ line[MultipleEntrances] == 'entrance' or \
+ line[MultipleEntrances] == 'last entrance':
+ args = {}
+
+ if line[Entrances]:
+ entrance_letter = line[Entrances]
+ else:
+ entrance_letter = ''
+
+ def addToArgs(CSVname, modelName):
+ if line[CSVname]:
+ args[modelName] = line[CSVname]
+ def addToArgsViaDict(CSVname, modelName, dictionary):
+ if line[CSVname]:
+ args[modelName] = dictionary[line[CSVname]]
+ addToArgs(EntranceName, 'name')
+ addToArgs(Explorers, 'explorers')
+ addToArgs(Map, 'map_description')
+ addToArgs(Location, 'location_description')
+ addToArgs(Approach, 'approach')
+ addToArgs(EntranceDescription, 'entrance_description')
+ addToArgs(UndergroundDescription, 'underground_description')
+ addToArgs(PhotoOfLocation, 'photo')
+ addToArgsViaDict(Marking, 'marking', {"Paint": "P",
+ "Paint (?)": "P?",
+ "Tag": "T",
+ "Tag (?)": "T?",
+ "Retagged": "R",
+ "Retag": "R",
+ "Spit": "S",
+ "Spit (?)": "S?",
+ "Unmarked": "U",
+ "": "?",
+ })
+
+ addToArgs(MarkingComment, 'marking_comment')
+ addToArgsViaDict(Findability, 'findability', {"Surveyed": "S",
+ "Lost": "L",
+ "Refindable": "R",
+ "": "?",
+ "?": "?",
+ })
+ addToArgs(FindabilityComment, 'findability_description')
+ addToArgs(Easting, 'easting')
+ addToArgs(Northing, 'northing')
+ addToArgs(Altitude, 'alt')
+ addToArgs(DescriptionOfOtherPoint, 'other_description')
+ addToArgs(TagPoint, 'tag_station')
+ addToArgs(ExactEntrance, 'exact_station')
+ addToArgs(OtherPoint, 'other_station')
+ addToArgs(OtherPoint, 'other_description')
+ if line[GPSpreSA]:
+ addToArgs(GPSpreSA, 'other_station')
+ args['other_description'] = 'pre selective availability GPS'
+ if line[GPSpostSA]:
+ addToArgs(GPSpostSA, 'other_station')
+ args['other_description'] = 'post selective availability GPS'
+ addToArgs(Bearings, 'bearings')
+ args['slug'] = newCave.slug + entrance_letter
+ newEntrance = models.Entrance(**args)
+ newEntrance.save()
+
+ logging.info("Added entrance "+str(newEntrance)+"\n")
+
+
+ newCaveAndEntrance = models.CaveAndEntrance(cave = newCave, entrance = newEntrance, entrance_letter = entrance_letter)
+ newCaveAndEntrance.save()
+
+ logging.info("Added CaveAndEntrance "+str(newCaveAndEntrance)+"\n")
+ if line[AutogenFile] != "":
+ f = flatpages.models.EntranceRedirect(originalURL = line[AutogenFile], entrance = newEntrance)
+ f.save()
+
+
+# lookup function modelled on GetPersonExpeditionNameLookup
+Gcavelookup = None
+def GetCaveLookup():
+ global Gcavelookup
+ if Gcavelookup:
+ return Gcavelookup
+ Gcavelookup = {"NONEPLACEHOLDER":None}
+ for cave in models.Cave.objects.all():
+ Gcavelookup[cave.official_name.lower()] = cave
+ if cave.kataster_number:
+ Gcavelookup[cave.kataster_number] = cave
+ if cave.unofficial_number:
+ Gcavelookup[cave.unofficial_number] = cave
+
+ Gcavelookup["tunnocks"] = Gcavelookup["258"]
+ Gcavelookup["hauchhole"] = Gcavelookup["234"]
+ return Gcavelookup
+
+
diff --git a/parsers/descriptions.py b/parsers/descriptions.py
index 2bca267..cf744fe 100644
--- a/parsers/descriptions.py
+++ b/parsers/descriptions.py
@@ -1,45 +1,45 @@
-from django.conf import settings
-import core.models as models
-import os
-from utils import html_to_wiki, get_html_body, get_html_title
-
-pages = [(["smkridge", "204", "ariston-rigging.html"], "ariston-rigging"),
- (["smkridge", "204", "ariston.html"], "ariston"),
- (["smkridge", "204", "bivvy.html"], "bivvy"),
- (["smkridge", "204", "bridge.html"], "bridge"),
- (["smkridge", "204", "entrance-rigging.html"], "entrance-rigging"),
- (["smkridge", "204", "entrance.html"], "entrance"),
- (["smkridge", "204", "midlevel.html"], "midlevel"),
- (["smkridge", "204", "millennium.html"], "millennium"),
- (["smkridge", "204", "nopain.html"], "nopain"),
- (["smkridge", "204", "razordance.html"], "razordance"),
- (["smkridge", "204", "rhino.html"], "rhino"),
- (["smkridge", "204", "sbview.html"], "sbview"),
- (["smkridge", "204", "subway.html"], "subway"),
- (["smkridge", "204", "swings.html"], "swings"),
- (["smkridge", "204", "treeumphant.html"], "treeumphant"),
- (["smkridge", "204", "uworld.html"], "uworld"), ]
-
-
-def getDescriptions():
- """Creates objects in the database for each item in the list 'pages' . """
- for filelocation, name in pages:
- f = open(os.path.join(settings.EXPOWEB, *filelocation), "r")
- html = f.read()
-
- cd = models.CaveDescription(short_name = name,
- long_name = unicode(get_html_title(html), "latin1"),
- description = unicode(get_html_body(html), "latin1"))
- cd.save()
-
-def parseDescriptions():
- """Turns the HTML in each cave description into wikicode"""
- for cd in models.CaveDescription.objects.all():
- cd.description = html_to_wiki(cd.description)
-
- cd.save()
-
-def parseDescriptionsOnCaveObjects():
- for cave in models.Cave.objects.all():
- cave.underground_description=html_to_wiki(unicode(cave.underground_description))
+from django.conf import settings
+import core.models as models
+import os
+from utils import html_to_wiki, get_html_body, get_html_title
+
+pages = [(["smkridge", "204", "ariston-rigging.html"], "ariston-rigging"),
+ (["smkridge", "204", "ariston.html"], "ariston"),
+ (["smkridge", "204", "bivvy.html"], "bivvy"),
+ (["smkridge", "204", "bridge.html"], "bridge"),
+ (["smkridge", "204", "entrance-rigging.html"], "entrance-rigging"),
+ (["smkridge", "204", "entrance.html"], "entrance"),
+ (["smkridge", "204", "midlevel.html"], "midlevel"),
+ (["smkridge", "204", "millennium.html"], "millennium"),
+ (["smkridge", "204", "nopain.html"], "nopain"),
+ (["smkridge", "204", "razordance.html"], "razordance"),
+ (["smkridge", "204", "rhino.html"], "rhino"),
+ (["smkridge", "204", "sbview.html"], "sbview"),
+ (["smkridge", "204", "subway.html"], "subway"),
+ (["smkridge", "204", "swings.html"], "swings"),
+ (["smkridge", "204", "treeumphant.html"], "treeumphant"),
+ (["smkridge", "204", "uworld.html"], "uworld"), ]
+
+
+def getDescriptions():
+ """Creates objects in the database for each item in the list 'pages' . """
+ for filelocation, name in pages:
+ f = open(os.path.join(settings.EXPOWEB, *filelocation), "r")
+ html = f.read()
+
+ cd = models.CaveDescription(short_name = name,
+ long_name = unicode(get_html_title(html), "latin1"),
+ description = unicode(get_html_body(html), "latin1"))
+ cd.save()
+
+def parseDescriptions():
+ """Turns the HTML in each cave description into wikicode"""
+ for cd in models.CaveDescription.objects.all():
+ cd.description = html_to_wiki(cd.description)
+
+ cd.save()
+
+def parseDescriptionsOnCaveObjects():
+ for cave in models.Cave.objects.all():
+ cave.underground_description=html_to_wiki(unicode(cave.underground_description))
 cave.save()
\ No newline at end of file
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index af01f46..c794f9f 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,432 +1,432 @@
-#.-*- coding: utf-8 -*-
-
-from django.conf import settings
-import core.models as models
-
-from parsers.people import GetPersonExpeditionNameLookup
-from parsers.cavetab import GetCaveLookup
-
-from django.template.defaultfilters import slugify
-
-import csv
-import re
-import datetime
-import os
-
-from utils import save_carefully
-
-#
-# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
-# it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc)
-#
-
-#
-# the logbook loading section
-#
-def GetTripPersons(trippeople, expedition, logtime_underground):
- res = [ ]
- author = None
- for tripperson in re.split(",|\+|&amp;|&(?!\w+;)| and ", trippeople):
- tripperson = tripperson.strip()
- mul = re.match("<u>(.*?)</u>$(?i)", tripperson)
- if mul:
- tripperson = mul.group(1).strip()
- if tripperson and tripperson[0] != '*':
- #assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
- personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
- if not personyear:
- print "NoMatchFor: '%s'" % tripperson
- res.append((personyear, logtime_underground))
- if mul:
- author = personyear
- if not author:
- if not res:
- return None, None
- author = res[-1][0]
- return res, author
-
-def GetTripCave(place): #need to be fuzzier about matching here. Already a very slow function...
-# print "Getting cave for " , place
- try:
- katastNumRes=[]
- katastNumRes=list(models.Cave.objects.filter(kataster_number=int(place)))
- except ValueError:
- pass
- officialNameRes=list(models.Cave.objects.filter(official_name=place))
- tripCaveRes=officialNameRes+katastNumRes
-
- if len(tripCaveRes)==1:
-# print "Place " , place , "entered as" , tripCaveRes[0]
- return tripCaveRes[0]
-
- elif models.OtherCaveName.objects.filter(name=place):
- tripCaveRes=models.OtherCaveName.objects.filter(name__icontains=place)[0].cave
-# print "Place " , place , "entered as" , tripCaveRes
- return tripCaveRes
-
- elif len(tripCaveRes)>1:
- print "Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes)
- correctIndex=input("type list index of correct cave")
- return tripCaveRes[correctIndex]
- else:
- print "No cave found for place " , place
- return
-
-
-noncaveplaces = [ "Journey", "Loser Plateau" ]
-def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground):
- """ saves a logbook entry and related persontrips """
- trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
- if not author:
- print "skipping logentry", title
- return
-
-# tripCave = GetTripCave(place)
- #
- lplace = place.lower()
- if lplace not in noncaveplaces:
- cave=GetCaveLookup().get(lplace)
-
- #Check for an existing copy of the current entry, and save
- expeditionday = expedition.get_expedition_day(date)
- lookupAttribs={'date':date, 'title':title}
- nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50]}
- lbo, created=save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)
-
- for tripperson, time_underground in trippersons:
- lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
- nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
- #print nonLookupAttribs
- save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)
-
-
-def ParseDate(tripdate, year):
- """ Interprets dates in the expo logbooks and returns a correct datetime.date object """
- mdatestandard = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
- mdategoof = re.match("(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
- if mdatestandard:
- assert mdatestandard.group(1) == year, (tripdate, year)
- year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
- elif mdategoof:
- assert not mdategoof.group(3) or mdategoof.group(3) == year[:2], mdategoof.groups()
- yadd = int(year[:2]) * 100
- day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
- else:
- assert False, tripdate
- return datetime.date(year, month, day)
-
-# 2007, 2008, 2006
-def Parselogwikitxt(year, expedition, txt):
- trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt)
- for triphead, triptext in trippara:
- tripheadp = triphead.split("|")
- #print "ttt", tripheadp
- assert len(tripheadp) == 3, (tripheadp, triptext)
- tripdate, tripplace, trippeople = tripheadp
- tripsplace = tripplace.split(" - ")
- tripcave = tripsplace[0].strip()
-
- tul = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
- if tul:
- #assert len(tul) <= 1, (triphead, triptext)
- #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
- tu = tul[0][0]
- else:
- tu = ""
- #assert tripcave == "Journey", (triphead, triptext)
-
- #print tripdate
- ldate = ParseDate(tripdate.strip(), year)
- #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
- EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
-# 2002, 2004, 2005
-def Parseloghtmltxt(year, expedition, txt):
- tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
- for trippara in tripparas:
-
- s = re.match('''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
- \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
- \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
- \s*<div\s+class="trippeople">\s*(.*?)</div>
- \s*<div\s+class="triptitle">\s*(.*?)</div>
- ([\s\S]*?)
- \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
- \s*$
- ''', trippara)
- if not s:
- if not re.search("Rigging Guide", trippara):
- print "can't parse: ", trippara # this is 2007 which needs editing
- #assert s, trippara
- continue
-
- tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
- ldate = ParseDate(tripdate.strip(), year)
- #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
- trippeople = re.sub("Ol(?!l)", "Olly", trippeople)
- trippeople = re.sub("Wook(?!e)", "Wookey", trippeople)
- triptitles = triptitle.split(" - ")
- if len(triptitles) >= 2:
- tripcave = triptitles[0]
- else:
- tripcave = "UNKNOWN"
- #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
- ltriptext = re.sub("</p>", "", triptext)
- ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
- EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
-
-# main parser for pre-2001. simpler because the data has been hacked so much to fit it
-def Parseloghtml01(year, expedition, txt):
- tripparas = re.findall("<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
- for trippara in tripparas:
- s = re.match(u"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
- assert s, trippara[:300]
- tripheader, triptext = s.group(1), s.group(2)
- mtripid = re.search('<a id="(.*?)"', tripheader)
- tripid = mtripid and mtripid.group(1) or ""
- tripheader = re.sub("</?(?:[ab]|span)[^>]*>", "", tripheader)
-
- #print " ", [tripheader]
- #continue
-
- tripdate, triptitle, trippeople = tripheader.split("|")
- ldate = ParseDate(tripdate.strip(), year)
-
- mtu = re.search('<p[^>]*>(T/?U.*)', triptext)
- if mtu:
- tu = mtu.group(1)
- triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
- else:
- tu = ""
-
- triptitles = triptitle.split(" - ")
- tripcave = triptitles[0].strip()
-
- ltriptext = triptext
-
- mtail = re.search('(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
- if mtail:
- #print mtail.group(0)
- ltriptext = ltriptext[:mtail.start(0)]
- ltriptext = re.sub("</p>", "", ltriptext)
- ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub("<p>|<br>", "\n\n", ltriptext).strip()
- #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
- ltriptext = re.sub("</?u>", "_", ltriptext)
- ltriptext = re.sub("</?i>", "''", ltriptext)
- ltriptext = re.sub("</?b>", "'''", ltriptext)
-
-
- #print ldate, trippeople.strip()
- # could includ the tripid (url link for cross referencing)
- EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
-
-def Parseloghtml03(year, expedition, txt):
- tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
- for trippara in tripparas:
- s = re.match(u"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
- assert s, trippara
- tripheader, triptext = s.group(1), s.group(2)
- tripheader = re.sub("&nbsp;", " ", tripheader)
- tripheader = re.sub("\s+", " ", tripheader).strip()
- sheader = tripheader.split(" -- ")
- tu = ""
- if re.match("T/U|Time underwater", sheader[-1]):
- tu = sheader.pop()
- if len(sheader) != 3:
- print "header not three pieces", sheader
- tripdate, triptitle, trippeople = sheader
- ldate = ParseDate(tripdate.strip(), year)
- triptitles = triptitle.split(" , ")
- if len(triptitles) >= 2:
- tripcave = triptitles[0]
- else:
- tripcave = "UNKNOWN"
- #print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
- ltriptext = re.sub("</p>", "", triptext)
- ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
- ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
- EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
-yearlinks = [
- ("2009", "2009/2009logbook.txt", Parselogwikitxt),
- ("2008", "2008/2008logbook.txt", Parselogwikitxt),
- ("2007", "2007/logbook.html", Parseloghtmltxt),
- ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt),
- ("2005", "2005/logbook.html", Parseloghtmltxt),
- ("2004", "2004/logbook.html", Parseloghtmltxt),
- ("2003", "2003/logbook.html", Parseloghtml03),
- ("2002", "2002/logbook.html", Parseloghtmltxt),
- ("2001", "2001/log.htm", Parseloghtml01),
- ("2000", "2000/log.htm", Parseloghtml01),
- ("1999", "1999/log.htm", Parseloghtml01),
- ("1998", "1998/log.htm", Parseloghtml01),
- ("1997", "1997/log.htm", Parseloghtml01),
- ("1996", "1996/log.htm", Parseloghtml01),
- ("1995", "1995/log.htm", Parseloghtml01),
- ("1994", "1994/log.htm", Parseloghtml01),
- ("1993", "1993/log.htm", Parseloghtml01),
- ("1992", "1992/log.htm", Parseloghtml01),
- ("1991", "1991/log.htm", Parseloghtml01),
- ]
-
-def SetDatesFromLogbookEntries(expedition):
- """
- Sets the date_from and date_to field for an expedition based on persontrips.
- Then sets the expedition date_from and date_to based on the personexpeditions.
- """
- for personexpedition in expedition.personexpedition_set.all():
- persontrips = personexpedition.persontrip_set.order_by('logbook_entry__date')
- # sequencing is difficult to do
- lprevpersontrip = None
- for persontrip in persontrips:
- persontrip.persontrip_prev = lprevpersontrip
- if lprevpersontrip:
- lprevpersontrip.persontrip_next = persontrip
- lprevpersontrip.save()
- persontrip.persontrip_next = None
- lprevpersontrip = persontrip
- persontrip.save()
-
-
-
-def LoadLogbookForExpedition(expedition):
- """ Parses all logbook entries for one expedition """
-
- expowebbase = os.path.join(settings.EXPOWEB, "years")
- year = str(expedition.year)
- for lyear, lloc, parsefunc in yearlinks:
- if lyear == year:
- break
- fin = open(os.path.join(expowebbase, lloc))
- print "opennning", lloc
- txt = fin.read().decode("latin1")
- fin.close()
- parsefunc(year, expedition, txt)
- SetDatesFromLogbookEntries(expedition)
- return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
-
-
-def LoadLogbooks():
- """ This is the master function for parsing all logbooks into the Troggle database. Requires yearlinks, which is a list of tuples for each expedition with expedition year, logbook path, and parsing function. """
-
- #Deletion has been moved to a seperate function to enable the non-destructive importing
- #models.LogbookEntry.objects.all().delete()
- expowebbase = os.path.join(settings.EXPOWEB, "years")
- #yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite
- #yearlinks = [ ("1996", "1996/log.htm", Parseloghtml01),] # overwrite
-
- for year, lloc, parsefunc in yearlinks:
- expedition = models.Expedition.objects.filter(year = year)[0]
- fin = open(os.path.join(expowebbase, lloc))
- txt = fin.read().decode("latin1")
- fin.close()
- parsefunc(year, expedition, txt)
- SetDatesFromLogbookEntries(expedition)
-
-dateRegex = re.compile('<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
-expeditionYearRegex = re.compile('<span\s+class="expeditionyear">(.*?)</span>', re.S)
-titleRegex = re.compile('<H1>(.*?)</H1>', re.S)
-reportRegex = re.compile('<div\s+class="report">(.*)</div>\s*</body>', re.S)
-personRegex = re.compile('<div\s+class="person">(.*?)</div>', re.S)
-nameAuthorRegex = re.compile('<span\s+class="name(,author|)">(.*?)</span>', re.S)
-TURegex = re.compile('<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
-locationRegex = re.compile('<span\s+class="location">(.*?)</span>', re.S)
-caveRegex = re.compile('<span\s+class="cave">(.*?)</span>', re.S)
-
-def parseAutoLogBookEntry(filename):
- errors = []
- f = open(filename, "r")
- contents = f.read()
- f.close()
-
- dateMatch = dateRegex.search(contents)
- if dateMatch:
- year, month, day = [int(x) for x in dateMatch.groups()]
- date = datetime.date(year, month, day)
- else:
- errors.append("Date could not be found")
-
- expeditionYearMatch = expeditionYearRegex.search(contents)
- if expeditionYearMatch:
- try:
- expedition = models.Expedition.objects.get(year = expeditionYearMatch.groups()[0])
- personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
- except models.Expedition.DoesNotExist:
- errors.append("Expedition not in database")
- else:
- errors.append("Expediton Year could not be parsed")
-
- titleMatch = titleRegex.search(contents)
- if titleMatch:
- title, = titleMatch.groups()
- if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
- errors.append("Title too long")
- else:
- errors.append("Title could not be found")
-
- caveMatch = caveRegex.search(contents)
- if caveMatch:
- caveRef, = caveMatch.groups()
- try:
- cave = models.getCaveByReference(caveRef)
- except AssertionError:
- cave = None
- errors.append("Cave not found in database")
- else:
- cave = None
-
- locationMatch = locationRegex.search(contents)
- if locationMatch:
- location, = locationMatch.groups()
- else:
- location = None
-
- if cave is None and location is None:
- errors.append("Location nor cave could not be found")
-
- reportMatch = reportRegex.search(contents)
- if reportMatch:
- report, = reportMatch.groups()
- else:
- errors.append("Contents could not be found")
- if errors:
- return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.
- people = []
- for personMatch in personRegex.findall(contents):
- nameAuthorMatch = nameAuthorRegex.search(contents)
- if nameAuthorMatch:
- author, name = nameAuthorMatch.groups()
- if name.lower() in personExpeditionNameLookup:
- personExpo = personExpeditionNameLookup[name.lower()]
- else:
- errors.append("Person could not be found in database")
- author = bool(author)
- else:
- errors.append("Persons name could not be found")
-
- TUMatch = TURegex.search(contents)
- if TUMatch:
- TU, = TUMatch.groups()
- else:
- errors.append("TU could not be found")
- if not errors:
- people.append((name, author, TU))
- if errors:
- return errors # Bail out before commiting to the database
- logbookEntry = models.LogbookEntry(date = date,
- expedition = expedition,
- title = title, cave = cave, place = location,
- text = report, slug = slugify(title)[:50],
- filename = filename)
- logbookEntry.save()
- for name, author, TU in people:
- models.PersonTrip(personexpedition = personExpo,
- time_underground = TU,
- logbook_entry = logbookEntry,
- is_logbook_entry_author = author).save()
- print logbookEntry
+#.-*- coding: utf-8 -*-
+
+from django.conf import settings
+import core.models as models
+
+from parsers.people import GetPersonExpeditionNameLookup
+from parsers.cavetab import GetCaveLookup
+
+from django.template.defaultfilters import slugify
+
+import csv
+import re
+import datetime
+import os
+
+from utils import save_carefully
+
+#
+# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
+# it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc)
+#
+
+#
+# the logbook loading section
+#
+def GetTripPersons(trippeople, expedition, logtime_underground):
+ res = [ ]
+ author = None
+ for tripperson in re.split(",|\+|&amp;|&(?!\w+;)| and ", trippeople):
+ tripperson = tripperson.strip()
+ mul = re.match("<u>(.*?)</u>$(?i)", tripperson)
+ if mul:
+ tripperson = mul.group(1).strip()
+ if tripperson and tripperson[0] != '*':
+ #assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
+ personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
+ if not personyear:
+ print "NoMatchFor: '%s'" % tripperson
+ res.append((personyear, logtime_underground))
+ if mul:
+ author = personyear
+ if not author:
+ if not res:
+ return None, None
+ author = res[-1][0]
+ return res, author
+
+def GetTripCave(place): #need to be fuzzier about matching here. Already a very slow function...
+# print "Getting cave for " , place
+ try:
+ katastNumRes=[]
+ katastNumRes=list(models.Cave.objects.filter(kataster_number=int(place)))
+ except ValueError:
+ pass
+ officialNameRes=list(models.Cave.objects.filter(official_name=place))
+ tripCaveRes=officialNameRes+katastNumRes
+
+ if len(tripCaveRes)==1:
+# print "Place " , place , "entered as" , tripCaveRes[0]
+ return tripCaveRes[0]
+
+ elif models.OtherCaveName.objects.filter(name=place):
+ tripCaveRes=models.OtherCaveName.objects.filter(name__icontains=place)[0].cave
+# print "Place " , place , "entered as" , tripCaveRes
+ return tripCaveRes
+
+ elif len(tripCaveRes)>1:
+ print "Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes)
+ correctIndex=input("type list index of correct cave")
+ return tripCaveRes[correctIndex]
+ else:
+ print "No cave found for place " , place
+ return
+
+
+noncaveplaces = [ "Journey", "Loser Plateau" ]
+def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground):
+ """ saves a logbook entry and related persontrips """
+ trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
+ if not author:
+ print "skipping logentry", title
+ return
+
+# tripCave = GetTripCave(place)
+ #
+ lplace = place.lower()
+ if lplace not in noncaveplaces:
+ cave=GetCaveLookup().get(lplace)
+
+ #Check for an existing copy of the current entry, and save
+ expeditionday = expedition.get_expedition_day(date)
+ lookupAttribs={'date':date, 'title':title}
+ nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50]}
+ lbo, created=save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)
+
+ for tripperson, time_underground in trippersons:
+ lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
+ nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
+ #print nonLookupAttribs
+ save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)
+
+
+def ParseDate(tripdate, year):
+ """ Interprets dates in the expo logbooks and returns a correct datetime.date object """
+ mdatestandard = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
+ mdategoof = re.match("(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
+ if mdatestandard:
+ assert mdatestandard.group(1) == year, (tripdate, year)
+ year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
+ elif mdategoof:
+ assert not mdategoof.group(3) or mdategoof.group(3) == year[:2], mdategoof.groups()
+ yadd = int(year[:2]) * 100
+ day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
+ else:
+ assert False, tripdate
+ return datetime.date(year, month, day)
+
+# 2007, 2008, 2006
+def Parselogwikitxt(year, expedition, txt):
+ trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt)
+ for triphead, triptext in trippara:
+ tripheadp = triphead.split("|")
+ #print "ttt", tripheadp
+ assert len(tripheadp) == 3, (tripheadp, triptext)
+ tripdate, tripplace, trippeople = tripheadp
+ tripsplace = tripplace.split(" - ")
+ tripcave = tripsplace[0].strip()
+
+ tul = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
+ if tul:
+ #assert len(tul) <= 1, (triphead, triptext)
+ #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
+ tu = tul[0][0]
+ else:
+ tu = ""
+ #assert tripcave == "Journey", (triphead, triptext)
+
+ #print tripdate
+ ldate = ParseDate(tripdate.strip(), year)
+ #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
+ EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
+
+# 2002, 2004, 2005
+def Parseloghtmltxt(year, expedition, txt):
+ tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
+ for trippara in tripparas:
+
+ s = re.match('''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
+ \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
+ \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
+ \s*<div\s+class="trippeople">\s*(.*?)</div>
+ \s*<div\s+class="triptitle">\s*(.*?)</div>
+ ([\s\S]*?)
+ \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
+ \s*$
+ ''', trippara)
+ if not s:
+ if not re.search("Rigging Guide", trippara):
+ print "can't parse: ", trippara # this is 2007 which needs editing
+ #assert s, trippara
+ continue
+
+ tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
+ ldate = ParseDate(tripdate.strip(), year)
+ #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
+ trippeople = re.sub("Ol(?!l)", "Olly", trippeople)
+ trippeople = re.sub("Wook(?!e)", "Wookey", trippeople)
+ triptitles = triptitle.split(" - ")
+ if len(triptitles) >= 2:
+ tripcave = triptitles[0]
+ else:
+ tripcave = "UNKNOWN"
+ #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
+ ltriptext = re.sub("</p>", "", triptext)
+ ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
+ ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
+ EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
+
+
+# main parser for pre-2001. simpler because the data has been hacked so much to fit it
+def Parseloghtml01(year, expedition, txt):
+ tripparas = re.findall("<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
+ for trippara in tripparas:
+ s = re.match(u"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
+ assert s, trippara[:300]
+ tripheader, triptext = s.group(1), s.group(2)
+ mtripid = re.search('<a id="(.*?)"', tripheader)
+ tripid = mtripid and mtripid.group(1) or ""
+ tripheader = re.sub("</?(?:[ab]|span)[^>]*>", "", tripheader)
+
+ #print " ", [tripheader]
+ #continue
+
+ tripdate, triptitle, trippeople = tripheader.split("|")
+ ldate = ParseDate(tripdate.strip(), year)
+
+ mtu = re.search('<p[^>]*>(T/?U.*)', triptext)
+ if mtu:
+ tu = mtu.group(1)
+ triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
+ else:
+ tu = ""
+
+ triptitles = triptitle.split(" - ")
+ tripcave = triptitles[0].strip()
+
+ ltriptext = triptext
+
+ mtail = re.search('(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
+ if mtail:
+ #print mtail.group(0)
+ ltriptext = ltriptext[:mtail.start(0)]
+ ltriptext = re.sub("</p>", "", ltriptext)
+ ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
+ ltriptext = re.sub("<p>|<br>", "\n\n", ltriptext).strip()
+ #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
+ ltriptext = re.sub("</?u>", "_", ltriptext)
+ ltriptext = re.sub("</?i>", "''", ltriptext)
+ ltriptext = re.sub("</?b>", "'''", ltriptext)
+
+
+ #print ldate, trippeople.strip()
+ # could includ the tripid (url link for cross referencing)
+ EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
+
+
+def Parseloghtml03(year, expedition, txt):
+ tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
+ for trippara in tripparas:
+ s = re.match(u"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
+ assert s, trippara
+ tripheader, triptext = s.group(1), s.group(2)
+ tripheader = re.sub("&nbsp;", " ", tripheader)
+ tripheader = re.sub("\s+", " ", tripheader).strip()
+ sheader = tripheader.split(" -- ")
+ tu = ""
+ if re.match("T/U|Time underwater", sheader[-1]):
+ tu = sheader.pop()
+ if len(sheader) != 3:
+ print "header not three pieces", sheader
+ tripdate, triptitle, trippeople = sheader
+ ldate = ParseDate(tripdate.strip(), year)
+ triptitles = triptitle.split(" , ")
+ if len(triptitles) >= 2:
+ tripcave = triptitles[0]
+ else:
+ tripcave = "UNKNOWN"
+ #print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
+ ltriptext = re.sub("</p>", "", triptext)
+ ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
+ ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
+ ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
+ EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
+
+yearlinks = [
+ ("2009", "2009/2009logbook.txt", Parselogwikitxt),
+ ("2008", "2008/2008logbook.txt", Parselogwikitxt),
+ ("2007", "2007/logbook.html", Parseloghtmltxt),
+ ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt),
+ ("2005", "2005/logbook.html", Parseloghtmltxt),
+ ("2004", "2004/logbook.html", Parseloghtmltxt),
+ ("2003", "2003/logbook.html", Parseloghtml03),
+ ("2002", "2002/logbook.html", Parseloghtmltxt),
+ ("2001", "2001/log.htm", Parseloghtml01),
+ ("2000", "2000/log.htm", Parseloghtml01),
+ ("1999", "1999/log.htm", Parseloghtml01),
+ ("1998", "1998/log.htm", Parseloghtml01),
+ ("1997", "1997/log.htm", Parseloghtml01),
+ ("1996", "1996/log.htm", Parseloghtml01),
+ ("1995", "1995/log.htm", Parseloghtml01),
+ ("1994", "1994/log.htm", Parseloghtml01),
+ ("1993", "1993/log.htm", Parseloghtml01),
+ ("1992", "1992/log.htm", Parseloghtml01),
+ ("1991", "1991/log.htm", Parseloghtml01),
+ ]
+
+def SetDatesFromLogbookEntries(expedition):
+ """
+ Sets the date_from and date_to field for an expedition based on persontrips.
+ Then sets the expedition date_from and date_to based on the personexpeditions.
+ """
+ for personexpedition in expedition.personexpedition_set.all():
+ persontrips = personexpedition.persontrip_set.order_by('logbook_entry__date')
+ # sequencing is difficult to do
+ lprevpersontrip = None
+ for persontrip in persontrips:
+ persontrip.persontrip_prev = lprevpersontrip
+ if lprevpersontrip:
+ lprevpersontrip.persontrip_next = persontrip
+ lprevpersontrip.save()
+ persontrip.persontrip_next = None
+ lprevpersontrip = persontrip
+ persontrip.save()
+
+
+
+def LoadLogbookForExpedition(expedition):
+ """ Parses all logbook entries for one expedition """
+
+ expowebbase = os.path.join(settings.EXPOWEB, "years")
+ year = str(expedition.year)
+ for lyear, lloc, parsefunc in yearlinks:
+ if lyear == year:
+ break
+ fin = open(os.path.join(expowebbase, lloc))
+ print "opennning", lloc
+ txt = fin.read().decode("latin1")
+ fin.close()
+ parsefunc(year, expedition, txt)
+ SetDatesFromLogbookEntries(expedition)
+ return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
+
+
+def LoadLogbooks():
+ """ This is the master function for parsing all logbooks into the Troggle database. Requires yearlinks, which is a list of tuples for each expedition with expedition year, logbook path, and parsing function. """
+
+ #Deletion has been moved to a seperate function to enable the non-destructive importing
+ #models.LogbookEntry.objects.all().delete()
+ expowebbase = os.path.join(settings.EXPOWEB, "years")
+ #yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite
+ #yearlinks = [ ("1996", "1996/log.htm", Parseloghtml01),] # overwrite
+
+ for year, lloc, parsefunc in yearlinks:
+ expedition = models.Expedition.objects.filter(year = year)[0]
+ fin = open(os.path.join(expowebbase, lloc))
+ txt = fin.read().decode("latin1")
+ fin.close()
+ parsefunc(year, expedition, txt)
+ SetDatesFromLogbookEntries(expedition)
+
+dateRegex = re.compile('<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
+expeditionYearRegex = re.compile('<span\s+class="expeditionyear">(.*?)</span>', re.S)
+titleRegex = re.compile('<H1>(.*?)</H1>', re.S)
+reportRegex = re.compile('<div\s+class="report">(.*)</div>\s*</body>', re.S)
+personRegex = re.compile('<div\s+class="person">(.*?)</div>', re.S)
+nameAuthorRegex = re.compile('<span\s+class="name(,author|)">(.*?)</span>', re.S)
+TURegex = re.compile('<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
+locationRegex = re.compile('<span\s+class="location">(.*?)</span>', re.S)
+caveRegex = re.compile('<span\s+class="cave">(.*?)</span>', re.S)
+
+def parseAutoLogBookEntry(filename):
+ errors = []
+ f = open(filename, "r")
+ contents = f.read()
+ f.close()
+
+ dateMatch = dateRegex.search(contents)
+ if dateMatch:
+ year, month, day = [int(x) for x in dateMatch.groups()]
+ date = datetime.date(year, month, day)
+ else:
+ errors.append("Date could not be found")
+
+ expeditionYearMatch = expeditionYearRegex.search(contents)
+ if expeditionYearMatch:
+ try:
+ expedition = models.Expedition.objects.get(year = expeditionYearMatch.groups()[0])
+ personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
+ except models.Expedition.DoesNotExist:
+ errors.append("Expedition not in database")
+ else:
+ errors.append("Expediton Year could not be parsed")
+
+ titleMatch = titleRegex.search(contents)
+ if titleMatch:
+ title, = titleMatch.groups()
+ if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
+ errors.append("Title too long")
+ else:
+ errors.append("Title could not be found")
+
+ caveMatch = caveRegex.search(contents)
+ if caveMatch:
+ caveRef, = caveMatch.groups()
+ try:
+ cave = models.getCaveByReference(caveRef)
+ except AssertionError:
+ cave = None
+ errors.append("Cave not found in database")
+ else:
+ cave = None
+
+ locationMatch = locationRegex.search(contents)
+ if locationMatch:
+ location, = locationMatch.groups()
+ else:
+ location = None
+
+ if cave is None and location is None:
+ errors.append("Location nor cave could not be found")
+
+ reportMatch = reportRegex.search(contents)
+ if reportMatch:
+ report, = reportMatch.groups()
+ else:
+ errors.append("Contents could not be found")
+ if errors:
+ return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.
+ people = []
+ for personMatch in personRegex.findall(contents):
+ nameAuthorMatch = nameAuthorRegex.search(contents)
+ if nameAuthorMatch:
+ author, name = nameAuthorMatch.groups()
+ if name.lower() in personExpeditionNameLookup:
+ personExpo = personExpeditionNameLookup[name.lower()]
+ else:
+ errors.append("Person could not be found in database")
+ author = bool(author)
+ else:
+ errors.append("Persons name could not be found")
+
+ TUMatch = TURegex.search(contents)
+ if TUMatch:
+ TU, = TUMatch.groups()
+ else:
+ errors.append("TU could not be found")
+ if not errors:
+ people.append((name, author, TU))
+ if errors:
+ return errors # Bail out before commiting to the database
+ logbookEntry = models.LogbookEntry(date = date,
+ expedition = expedition,
+ title = title, cave = cave, place = location,
+ text = report, slug = slugify(title)[:50],
+ filename = filename)
+ logbookEntry.save()
+ for name, author, TU in people:
+ models.PersonTrip(personexpedition = personExpo,
+ time_underground = TU,
+ logbook_entry = logbookEntry,
+ is_logbook_entry_author = author).save()
+ print logbookEntry
diff --git a/parsers/subcaves.py b/parsers/subcaves.py
index bdd64a1..6905d0a 100644
--- a/parsers/subcaves.py
+++ b/parsers/subcaves.py
@@ -1,56 +1,56 @@
-'''
-This module is the part of troggle that parses descriptions of cave parts (subcaves) from the legacy html files and saves them in the troggle database as instances of the model Subcave. Unfortunately, this parser can not be very flexible because the legacy format is poorly structured.
-'''
-
-import sys, os
-
-import os, re, logging
-from django.conf import settings
-from core.models import Subcave, Cave
-from utils import save_carefully
-
-def getLinksInCaveDescription(cave):
- '''
- Returns all HTML <a href> tags from a given cave as a list of tuples
- in the format ('filename.html','Description')
- '''
- pattern='<a href=\"(.*?)\">(.*?)</a>'
- if cave.underground_description:
- return re.findall(pattern,cave.underground_description)
- else:
- return []
-
-def importSubcaves(cave):
- for link in getLinksInCaveDescription(cave):
- try:
- subcaveFilePath=os.path.join(
- settings.EXPOWEB,
- os.path.dirname(cave.description_file),
- link[0])
- subcaveFile=open(subcaveFilePath,'r')
- description=subcaveFile.read().decode('iso-8859-1').encode('utf-8')
-
- lookupAttribs={'title':link[1], 'cave':cave}
- nonLookupAttribs={'description':description}
- newSubcave=save_carefully(Subcave,lookupAttribs=lookupAttribs,nonLookupAttribs=nonLookupAttribs)
-
- logging.info("Added " + unicode(newSubcave) + " to " + unicode(cave))
- except IOError:
- logging.info("Subcave import couldn't open "+subcaveFilePath)
-
-def getLinksInSubcaveDescription(subcave):
- pattern='<a href=\"(.*?)\">(.*?)</a>'
- if subcave.description:
- return re.findall(pattern,subcave.description)
- else:
- return []
-
-def getLinksInAllSubcaves():
- bigList=[]
- for subcave in Subcave.objects.all():
- bigList+=getLinksInSubcaveDescription(subcave)
- return bigList
-
-def importAllSubcaves():
- for cave in Cave.objects.all():
- importSubcaves(cave)
+'''
+This module is the part of troggle that parses descriptions of cave parts (subcaves) from the legacy html files and saves them in the troggle database as instances of the model Subcave. Unfortunately, this parser can not be very flexible because the legacy format is poorly structured.
+'''
+
+import sys, os
+
+import os, re, logging
+from django.conf import settings
+from core.models import Subcave, Cave
+from utils import save_carefully
+
+def getLinksInCaveDescription(cave):
+ '''
+ Returns all HTML <a href> tags from a given cave as a list of tuples
+ in the format ('filename.html','Description')
+ '''
+ pattern='<a href=\"(.*?)\">(.*?)</a>'
+ if cave.underground_description:
+ return re.findall(pattern,cave.underground_description)
+ else:
+ return []
+
+def importSubcaves(cave):
+ for link in getLinksInCaveDescription(cave):
+ try:
+ subcaveFilePath=os.path.join(
+ settings.EXPOWEB,
+ os.path.dirname(cave.description_file),
+ link[0])
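+            # e.g. (hypothetical paths) description_file "1623/204/204.html"
+            # plus a link to "gallery.html" resolves to
+            # <EXPOWEB>/1623/204/gallery.html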
+ subcaveFile=open(subcaveFilePath,'r')
+ description=subcaveFile.read().decode('iso-8859-1').encode('utf-8')
+
+ lookupAttribs={'title':link[1], 'cave':cave}
+ nonLookupAttribs={'description':description}
+ newSubcave=save_carefully(Subcave,lookupAttribs=lookupAttribs,nonLookupAttribs=nonLookupAttribs)
+
+ logging.info("Added " + unicode(newSubcave) + " to " + unicode(cave))
+ except IOError:
+ logging.info("Subcave import couldn't open "+subcaveFilePath)
+
+def getLinksInSubcaveDescription(subcave):
+ pattern='<a href=\"(.*?)\">(.*?)</a>'
+ if subcave.description:
+ return re.findall(pattern,subcave.description)
+ else:
+ return []
+
+def getLinksInAllSubcaves():
+ bigList=[]
+ for subcave in Subcave.objects.all():
+ bigList+=getLinksInSubcaveDescription(subcave)
+ return bigList
+
+def importAllSubcaves():
+ for cave in Cave.objects.all():
+ importSubcaves(cave)
diff --git a/parsers/surveys.py b/parsers/surveys.py
index 4ac5067..0c6eb4b 100644
--- a/parsers/surveys.py
+++ b/parsers/surveys.py
@@ -1,301 +1,301 @@
-import sys, os, types, logging, stat
-#sys.path.append('C:\\Expo\\expoweb')
-#from troggle import *
-#os.environ['DJANGO_SETTINGS_MODULE']='troggle.settings'
-import settings
-from core.models import *
-from PIL import Image
-#import settings
-#import core.models as models
-import csv
-import re
-import datetime
-from utils import save_carefully
-
-def get_or_create_placeholder(year):
- """ All surveys must be related to a logbookentry. We don't have a way to
- automatically figure out which survey went with which logbookentry,
- so we create a survey placeholder logbook entry for each year. This
- function always returns such a placeholder, and creates it if it doesn't
- exist yet.
- """
- lookupAttribs={'date__year':int(year), 'title':"placeholder for surveys",}
- nonLookupAttribs={'text':"surveys temporarily attached to this should be re-attached to their actual trips", 'date':datetime.date(int(year),1,1)}
- placeholder_logbook_entry, newly_created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
- return placeholder_logbook_entry
-
-# dead
-def readSurveysFromCSV():
- try: # could probably combine these two
- surveytab = open(os.path.join(settings.SURVEY_SCANS, "Surveys.csv"))
- except IOError:
- import cStringIO, urllib
- surveytab = cStringIO.StringIO(urllib.urlopen(settings.SURVEY_SCANS + "/Surveys.csv").read())
- dialect=csv.Sniffer().sniff(surveytab.read())
- surveytab.seek(0,0)
- surveyreader = csv.reader(surveytab,dialect=dialect)
- headers = surveyreader.next()
- header = dict(zip(headers, range(len(headers)))) #set up a dictionary where the indexes are header names and the values are column numbers
-
- # test if the expeditions have been added yet
- if Expedition.objects.count()==0:
- print "There are no expeditions in the database. Please run the logbook parser."
- sys.exit()
-
-
- logging.info("Deleting all scanned images")
- ScannedImage.objects.all().delete()
-
-
- logging.info("Deleting all survey objects")
- Survey.objects.all().delete()
-
-
- logging.info("Beginning to import surveys from "+str(os.path.join(settings.SURVEYS, "Surveys.csv"))+"\n"+"-"*60+"\n")
-
- for survey in surveyreader:
- #I hate this, but some surveys have a letter eg 2000#34a. The next line deals with that.
- walletNumberLetter = re.match(r'(?P<number>\d*)(?P<letter>[a-zA-Z]*)',survey[header['Survey Number']])
- # print walletNumberLetter.groups()
- year=survey[header['Year']]
-
-
- surveyobj = Survey(
- expedition = Expedition.objects.filter(year=year)[0],
- wallet_number = walletNumberLetter.group('number'),
- logbook_entry = get_or_create_placeholder(year),
- comments = survey[header['Comments']],
- location = survey[header['Location']]
- )
- surveyobj.wallet_letter = walletNumberLetter.group('letter')
- if survey[header['Finished']]=='Yes':
- #try and find the sketch_scan
- pass
- surveyobj.save()
-
-
- logging.info("added survey " + survey[header['Year']] + "#" + surveyobj.wallet_number + "\r")
-
-# dead
-def listdir(*directories):
- try:
- return os.listdir(os.path.join(settings.SURVEYS, *directories))
- except:
- import urllib
- url = settings.SURVEYS + reduce(lambda x, y: x + "/" + y, ["listdir"] + list(directories))
- folders = urllib.urlopen(url.replace("#", "%23")).readlines()
- return [folder.rstrip(r"/") for folder in folders]
-
-# add survey scans
-def parseSurveyScans(year, logfile=None):
-# yearFileList = listdir(year.year)
- yearPath=os.path.join(settings.SURVEY_SCANS, "years", year.year)
- yearFileList=os.listdir(yearPath)
- print yearFileList
- for surveyFolder in yearFileList:
- try:
- surveyNumber=re.match(r'\d\d\d\d#0*(\d+)',surveyFolder).groups()
-# scanList = listdir(year.year, surveyFolder)
- scanList=os.listdir(os.path.join(yearPath,surveyFolder))
- except AttributeError:
- print surveyFolder + " ignored",
- continue
-
- for scan in scanList:
- try:
- scanChopped=re.match(r'(?i).*(notes|elev|plan|elevation|extend)(\d*)\.(png|jpg|jpeg)',scan).groups()
- scanType,scanNumber,scanFormat=scanChopped
- except AttributeError:
- print scan + " ignored \r",
- continue
- if scanType == 'elev' or scanType == 'extend':
- scanType = 'elevation'
-
- if scanNumber=='':
- scanNumber=1
-
- if type(surveyNumber)==types.TupleType:
- surveyNumber=surveyNumber[0]
- try:
- placeholder=get_or_create_placeholder(year=int(year.year))
- survey=Survey.objects.get_or_create(wallet_number=surveyNumber, expedition=year, defaults={'logbook_entry':placeholder})[0]
- except Survey.MultipleObjectsReturned:
- survey=Survey.objects.filter(wallet_number=surveyNumber, expedition=year)[0]
- file=os.path.join(year.year, surveyFolder, scan)
- scanObj = ScannedImage(
- file=file,
- contents=scanType,
- number_in_wallet=scanNumber,
- survey=survey,
- new_since_parsing=False,
- )
- #print "Added scanned image at " + str(scanObj)
- if scanFormat=="png":
- if isInterlacedPNG(os.path.join(settings.SURVEY_SCANS,file)):
- print file + " is an interlaced PNG. No can do."
- continue
- scanObj.save()
-
-# dead
-def parseSurveys(logfile=None):
- readSurveysFromCSV()
- for year in Expedition.objects.filter(year__gte=2000): #expos since 2000, because paths and filenames were nonstandard before then
- parseSurveyScans(year)
-
-# dead
-def isInterlacedPNG(filePath): #We need to check for interlaced PNGs because the thumbnail engine can't handle them (uses PIL)
- file=Image.open(filePath)
- print filePath
- if 'interlace' in file.info:
- return file.info['interlace']
- else:
- return False
-
-
-# handles url or file, so we can refer to a set of scans on another server
-def GetListDir(sdir):
- res = [ ]
- if sdir[:7] == "http://":
- assert False, "Not written"
- s = urllib.urlopen(sdir)
- else:
- for f in os.listdir(sdir):
- if f[0] != ".":
- ff = os.path.join(sdir, f)
- res.append((f, ff, os.path.isdir(ff)))
- return res
-
-
-
-
-
-def LoadListScansFile(survexscansfolder):
- gld = [ ]
-
- # flatten out any directories in these book files
- for (fyf, ffyf, fisdiryf) in GetListDir(survexscansfolder.fpath):
- if fisdiryf:
- gld.extend(GetListDir(ffyf))
- else:
- gld.append((fyf, ffyf, fisdiryf))
-
- for (fyf, ffyf, fisdiryf) in gld:
- assert not fisdiryf, ffyf
- if re.search("\.(?:png|jpg|jpeg)(?i)$", fyf):
- survexscansingle = SurvexScanSingle(ffile=ffyf, name=fyf, survexscansfolder=survexscansfolder)
- survexscansingle.save()
-
-
-# this iterates through the scans directories (either here or on the remote server)
-# and builds up the models we can access later
-def LoadListScans():
- SurvexScanSingle.objects.all().delete()
- SurvexScansFolder.objects.all().delete()
-
- # first do the smkhs (large kh survey scans) directory
- survexscansfoldersmkhs = SurvexScansFolder(fpath=os.path.join(settings.SURVEY_SCANS, "smkhs"), walletname="smkhs")
- if os.path.isdir(survexscansfoldersmkhs.fpath):
- survexscansfoldersmkhs.save()
- LoadListScansFile(survexscansfoldersmkhs)
-
-
- # iterate into the surveyscans directory
- for f, ff, fisdir in GetListDir(os.path.join(settings.SURVEY_SCANS, "surveyscans")):
- if not fisdir:
- continue
-
- # do the year folders
- if re.match("\d\d\d\d$", f):
- for fy, ffy, fisdiry in GetListDir(ff):
- if fisdiry:
- assert fisdiry, ffy
- survexscansfolder = SurvexScansFolder(fpath=ffy, walletname=fy)
- survexscansfolder.save()
- LoadListScansFile(survexscansfolder)
-
- # do the
- elif f != "thumbs":
- survexscansfolder = SurvexScansFolder(fpath=ff, walletname=f)
- survexscansfolder.save()
- LoadListScansFile(survexscansfolder)
-
-
-def FindTunnelScan(tunnelfile, path):
- scansfolder, scansfile = None, None
- mscansdir = re.search("(\d\d\d\d#\d+\w?|1995-96kh|92-94Surveybookkh|1991surveybook|smkhs)/(.*?(?:png|jpg))$", path)
- if mscansdir:
- scansfolderl = SurvexScansFolder.objects.filter(walletname=mscansdir.group(1))
- if len(scansfolderl):
- assert len(scansfolderl) == 1
- scansfolder = scansfolderl[0]
- if scansfolder:
- scansfilel = scansfolder.survexscansingle_set.filter(name=mscansdir.group(2))
- if len(scansfilel):
- assert len(scansfilel) == 1
- scansfile = scansfilel[0]
-
- if scansfolder:
- tunnelfile.survexscansfolders.add(scansfolder)
- if scansfile:
- tunnelfile.survexscans.add(scansfile)
-
- elif path and not re.search("\.(?:png|jpg)$(?i)", path):
- name = os.path.split(path)[1]
- print "ttt", tunnelfile.tunnelpath, path, name
- rtunnelfilel = TunnelFile.objects.filter(tunnelname=name)
- if len(rtunnelfilel):
- assert len(rtunnelfilel) == 1, ("two paths with name of", path, "need more discrimination coded")
- rtunnelfile = rtunnelfilel[0]
- #print "ttt", tunnelfile.tunnelpath, path, name, rtunnelfile.tunnelpath
- tunnelfile.tunnelcontains.add(rtunnelfile)
-
- tunnelfile.save()
-
-
-def SetTunnelfileInfo(tunnelfile):
- ff = os.path.join(settings.TUNNEL_DATA, tunnelfile.tunnelpath)
- tunnelfile.filesize = os.stat(ff)[stat.ST_SIZE]
- fin = open(ff)
- ttext = fin.read()
- fin.close()
-
- mtype = re.search("<(fontcolours|sketch)", ttext)
- assert mtype, ff
- tunnelfile.bfontcolours = (mtype.group(1)=="fontcolours")
- tunnelfile.npaths = len(re.findall("<skpath", ttext))
- tunnelfile.save()
-
- # <tunnelxml tunnelversion="version2009-06-21 Matienzo" tunnelproject="ireby" tunneluser="goatchurch" tunneldate="2009-06-29 23:22:17">
- # <pcarea area_signal="frame" sfscaledown="12.282584" sfrotatedeg="-90.76982" sfxtrans="11.676667377221136" sfytrans="-15.677173422877454" sfsketch="204description/scans/plan(38).png" sfstyle="" nodeconnzsetrelative="0.0">
- for path, style in re.findall('<pcarea area_signal="frame".*?sfsketch="([^"]*)" sfstyle="([^"]*)"', ttext):
- FindTunnelScan(tunnelfile, path)
-
- # should also scan and look for survex blocks that might have been included
- # and also survex titles as well.
-
- tunnelfile.save()
-
-
-def LoadTunnelFiles():
- tunneldatadir = settings.TUNNEL_DATA
- TunnelFile.objects.all().delete()
- tunneldirs = [ "" ]
- while tunneldirs:
- tunneldir = tunneldirs.pop()
- for f in os.listdir(os.path.join(tunneldatadir, tunneldir)):
- if f[0] == "." or f[-1] == "~":
- continue
- lf = os.path.join(tunneldir, f)
- ff = os.path.join(tunneldatadir, lf)
- if os.path.isdir(ff):
- tunneldirs.append(lf)
- elif f[-4:] == ".xml":
- tunnelfile = TunnelFile(tunnelpath=lf, tunnelname=os.path.split(f[:-4])[1])
- tunnelfile.save()
-
- for tunnelfile in TunnelFile.objects.all():
- SetTunnelfileInfo(tunnelfile)
-
-
-
-
+import sys, os, types, logging, stat
+#sys.path.append('C:\\Expo\\expoweb')
+#from troggle import *
+#os.environ['DJANGO_SETTINGS_MODULE']='troggle.settings'
+import settings
+from core.models import *
+from PIL import Image
+#import settings
+#import core.models as models
+import csv
+import re
+import datetime
+from utils import save_carefully
+
+def get_or_create_placeholder(year):
+ """ All surveys must be related to a logbookentry. We don't have a way to
+ automatically figure out which survey went with which logbookentry,
+ so we create a survey placeholder logbook entry for each year. This
+ function always returns such a placeholder, and creates it if it doesn't
+ exist yet.
+ """
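+    # sketch of the intended behaviour: get_or_create_placeholder(2007) should
+    # return the single "placeholder for surveys" LogbookEntry dated 2007-01-01,
+    # creating it on the first call (via save_carefully's get-or-create semantics)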
+ lookupAttribs={'date__year':int(year), 'title':"placeholder for surveys",}
+ nonLookupAttribs={'text':"surveys temporarily attached to this should be re-attached to their actual trips", 'date':datetime.date(int(year),1,1)}
+ placeholder_logbook_entry, newly_created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
+ return placeholder_logbook_entry
+
+# dead
+def readSurveysFromCSV():
+ try: # could probably combine these two
+ surveytab = open(os.path.join(settings.SURVEY_SCANS, "Surveys.csv"))
+ except IOError:
+ import cStringIO, urllib
+ surveytab = cStringIO.StringIO(urllib.urlopen(settings.SURVEY_SCANS + "/Surveys.csv").read())
+ dialect=csv.Sniffer().sniff(surveytab.read())
+ surveytab.seek(0,0)
+ surveyreader = csv.reader(surveytab,dialect=dialect)
+ headers = surveyreader.next()
+    header = dict(zip(headers, range(len(headers)))) #set up a dictionary where the keys are header names and the values are column numbers
+
+ # test if the expeditions have been added yet
+ if Expedition.objects.count()==0:
+ print "There are no expeditions in the database. Please run the logbook parser."
+ sys.exit()
+
+
+ logging.info("Deleting all scanned images")
+ ScannedImage.objects.all().delete()
+
+
+ logging.info("Deleting all survey objects")
+ Survey.objects.all().delete()
+
+
+    logging.info("Beginning to import surveys from "+str(os.path.join(settings.SURVEY_SCANS, "Surveys.csv"))+"\n"+"-"*60+"\n")
+
+ for survey in surveyreader:
+ #I hate this, but some surveys have a letter eg 2000#34a. The next line deals with that.
+ walletNumberLetter = re.match(r'(?P<number>\d*)(?P<letter>[a-zA-Z]*)',survey[header['Survey Number']])
+ # print walletNumberLetter.groups()
+ year=survey[header['Year']]
+
+
+ surveyobj = Survey(
+ expedition = Expedition.objects.filter(year=year)[0],
+ wallet_number = walletNumberLetter.group('number'),
+ logbook_entry = get_or_create_placeholder(year),
+ comments = survey[header['Comments']],
+ location = survey[header['Location']]
+ )
+ surveyobj.wallet_letter = walletNumberLetter.group('letter')
+ if survey[header['Finished']]=='Yes':
+ #try and find the sketch_scan
+ pass
+ surveyobj.save()
+
+
+ logging.info("added survey " + survey[header['Year']] + "#" + surveyobj.wallet_number + "\r")
+
+# dead
+def listdir(*directories):
+ try:
+ return os.listdir(os.path.join(settings.SURVEYS, *directories))
+ except:
+ import urllib
+ url = settings.SURVEYS + reduce(lambda x, y: x + "/" + y, ["listdir"] + list(directories))
+ folders = urllib.urlopen(url.replace("#", "%23")).readlines()
+ return [folder.rstrip(r"/") for folder in folders]
+
+# add survey scans
+def parseSurveyScans(year, logfile=None):
+# yearFileList = listdir(year.year)
+ yearPath=os.path.join(settings.SURVEY_SCANS, "years", year.year)
+ yearFileList=os.listdir(yearPath)
+ print yearFileList
+ for surveyFolder in yearFileList:
+ try:
+ surveyNumber=re.match(r'\d\d\d\d#0*(\d+)',surveyFolder).groups()
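+            # e.g. a folder named "2007#017" (hypothetical) matches with
+            # .groups() == ('17',); the one-element tuple is unpacked further down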
+# scanList = listdir(year.year, surveyFolder)
+ scanList=os.listdir(os.path.join(yearPath,surveyFolder))
+ except AttributeError:
+ print surveyFolder + " ignored",
+ continue
+
+ for scan in scanList:
+ try:
+ scanChopped=re.match(r'(?i).*(notes|elev|plan|elevation|extend)(\d*)\.(png|jpg|jpeg)',scan).groups()
+ scanType,scanNumber,scanFormat=scanChopped
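+                # e.g. (hypothetical filenames) "plan2.png" gives
+                # scanType='plan', scanNumber='2', scanFormat='png';
+                # "notes.jpg" gives scanType='notes', scanNumber='', scanFormat='jpg'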
+ except AttributeError:
+ print scan + " ignored \r",
+ continue
+ if scanType == 'elev' or scanType == 'extend':
+ scanType = 'elevation'
+
+ if scanNumber=='':
+ scanNumber=1
+
+ if type(surveyNumber)==types.TupleType:
+ surveyNumber=surveyNumber[0]
+ try:
+ placeholder=get_or_create_placeholder(year=int(year.year))
+ survey=Survey.objects.get_or_create(wallet_number=surveyNumber, expedition=year, defaults={'logbook_entry':placeholder})[0]
+ except Survey.MultipleObjectsReturned:
+ survey=Survey.objects.filter(wallet_number=surveyNumber, expedition=year)[0]
+ file=os.path.join(year.year, surveyFolder, scan)
+ scanObj = ScannedImage(
+ file=file,
+ contents=scanType,
+ number_in_wallet=scanNumber,
+ survey=survey,
+ new_since_parsing=False,
+ )
+ #print "Added scanned image at " + str(scanObj)
+ if scanFormat=="png":
+ if isInterlacedPNG(os.path.join(settings.SURVEY_SCANS,file)):
+ print file + " is an interlaced PNG. No can do."
+ continue
+ scanObj.save()
+
+# dead
+def parseSurveys(logfile=None):
+ readSurveysFromCSV()
+ for year in Expedition.objects.filter(year__gte=2000): #expos since 2000, because paths and filenames were nonstandard before then
+ parseSurveyScans(year)
+
+# dead
+def isInterlacedPNG(filePath): #We need to check for interlaced PNGs because the thumbnail engine can't handle them (uses PIL)
+ file=Image.open(filePath)
+ print filePath
+ if 'interlace' in file.info:
+ return file.info['interlace']
+ else:
+ return False
+
+
+# handles url or file, so we can refer to a set of scans on another server
+def GetListDir(sdir):
+ res = [ ]
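+    # each entry appended below is (name, fullpath, isdir), e.g. (hypothetical)
+    # ("notes1.png", "/expo/surveyscans/2007#12/notes1.png", False)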
+ if sdir[:7] == "http://":
+ assert False, "Not written"
+ s = urllib.urlopen(sdir)
+ else:
+ for f in os.listdir(sdir):
+ if f[0] != ".":
+ ff = os.path.join(sdir, f)
+ res.append((f, ff, os.path.isdir(ff)))
+ return res
+
+
+
+
+
+def LoadListScansFile(survexscansfolder):
+ gld = [ ]
+
+ # flatten out any directories in these book files
+ for (fyf, ffyf, fisdiryf) in GetListDir(survexscansfolder.fpath):
+ if fisdiryf:
+ gld.extend(GetListDir(ffyf))
+ else:
+ gld.append((fyf, ffyf, fisdiryf))
+
+ for (fyf, ffyf, fisdiryf) in gld:
+ assert not fisdiryf, ffyf
+ if re.search("\.(?:png|jpg|jpeg)(?i)$", fyf):
+ survexscansingle = SurvexScanSingle(ffile=ffyf, name=fyf, survexscansfolder=survexscansfolder)
+ survexscansingle.save()
+
+
+# this iterates through the scans directories (either here or on the remote server)
+# and builds up the models we can access later
+def LoadListScans():
+ SurvexScanSingle.objects.all().delete()
+ SurvexScansFolder.objects.all().delete()
+
+ # first do the smkhs (large kh survey scans) directory
+ survexscansfoldersmkhs = SurvexScansFolder(fpath=os.path.join(settings.SURVEY_SCANS, "smkhs"), walletname="smkhs")
+ if os.path.isdir(survexscansfoldersmkhs.fpath):
+ survexscansfoldersmkhs.save()
+ LoadListScansFile(survexscansfoldersmkhs)
+
+
+ # iterate into the surveyscans directory
+ for f, ff, fisdir in GetListDir(os.path.join(settings.SURVEY_SCANS, "surveyscans")):
+ if not fisdir:
+ continue
+
+ # do the year folders
+ if re.match("\d\d\d\d$", f):
+ for fy, ffy, fisdiry in GetListDir(ff):
+ if fisdiry:
+ assert fisdiry, ffy
+ survexscansfolder = SurvexScansFolder(fpath=ffy, walletname=fy)
+ survexscansfolder.save()
+ LoadListScansFile(survexscansfolder)
+
+        # do the non-year folders (e.g. the old survey book scan directories), skipping "thumbs"
+ elif f != "thumbs":
+ survexscansfolder = SurvexScansFolder(fpath=ff, walletname=f)
+ survexscansfolder.save()
+ LoadListScansFile(survexscansfolder)
+
+
+def FindTunnelScan(tunnelfile, path):
+ scansfolder, scansfile = None, None
+ mscansdir = re.search("(\d\d\d\d#\d+\w?|1995-96kh|92-94Surveybookkh|1991surveybook|smkhs)/(.*?(?:png|jpg))$", path)
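+    # e.g. (hypothetical path) "2005#11/notes2.png" gives group(1)="2005#11"
+    # (the wallet name) and group(2)="notes2.png" (the scan file within it)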
+ if mscansdir:
+ scansfolderl = SurvexScansFolder.objects.filter(walletname=mscansdir.group(1))
+ if len(scansfolderl):
+ assert len(scansfolderl) == 1
+ scansfolder = scansfolderl[0]
+ if scansfolder:
+ scansfilel = scansfolder.survexscansingle_set.filter(name=mscansdir.group(2))
+ if len(scansfilel):
+ assert len(scansfilel) == 1
+ scansfile = scansfilel[0]
+
+ if scansfolder:
+ tunnelfile.survexscansfolders.add(scansfolder)
+ if scansfile:
+ tunnelfile.survexscans.add(scansfile)
+
+ elif path and not re.search("\.(?:png|jpg)$(?i)", path):
+ name = os.path.split(path)[1]
+ print "ttt", tunnelfile.tunnelpath, path, name
+ rtunnelfilel = TunnelFile.objects.filter(tunnelname=name)
+ if len(rtunnelfilel):
+ assert len(rtunnelfilel) == 1, ("two paths with name of", path, "need more discrimination coded")
+ rtunnelfile = rtunnelfilel[0]
+ #print "ttt", tunnelfile.tunnelpath, path, name, rtunnelfile.tunnelpath
+ tunnelfile.tunnelcontains.add(rtunnelfile)
+
+ tunnelfile.save()
+
+
+def SetTunnelfileInfo(tunnelfile):
+ ff = os.path.join(settings.TUNNEL_DATA, tunnelfile.tunnelpath)
+ tunnelfile.filesize = os.stat(ff)[stat.ST_SIZE]
+ fin = open(ff)
+ ttext = fin.read()
+ fin.close()
+
+ mtype = re.search("<(fontcolours|sketch)", ttext)
+ assert mtype, ff
+ tunnelfile.bfontcolours = (mtype.group(1)=="fontcolours")
+ tunnelfile.npaths = len(re.findall("<skpath", ttext))
+ tunnelfile.save()
+
+ # <tunnelxml tunnelversion="version2009-06-21 Matienzo" tunnelproject="ireby" tunneluser="goatchurch" tunneldate="2009-06-29 23:22:17">
+ # <pcarea area_signal="frame" sfscaledown="12.282584" sfrotatedeg="-90.76982" sfxtrans="11.676667377221136" sfytrans="-15.677173422877454" sfsketch="204description/scans/plan(38).png" sfstyle="" nodeconnzsetrelative="0.0">
+ for path, style in re.findall('<pcarea area_signal="frame".*?sfsketch="([^"]*)" sfstyle="([^"]*)"', ttext):
+ FindTunnelScan(tunnelfile, path)
+
+ # should also scan and look for survex blocks that might have been included
+ # and also survex titles as well.
+
+ tunnelfile.save()
+
+
+def LoadTunnelFiles():
+ tunneldatadir = settings.TUNNEL_DATA
+ TunnelFile.objects.all().delete()
+ tunneldirs = [ "" ]
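+    # simple stack-based walk of TUNNEL_DATA: subdirectories found below are
+    # pushed onto tunneldirs and processed in turn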
+ while tunneldirs:
+ tunneldir = tunneldirs.pop()
+ for f in os.listdir(os.path.join(tunneldatadir, tunneldir)):
+ if f[0] == "." or f[-1] == "~":
+ continue
+ lf = os.path.join(tunneldir, f)
+ ff = os.path.join(tunneldatadir, lf)
+ if os.path.isdir(ff):
+ tunneldirs.append(lf)
+ elif f[-4:] == ".xml":
+ tunnelfile = TunnelFile(tunnelpath=lf, tunnelname=os.path.split(f[:-4])[1])
+ tunnelfile.save()
+
+ for tunnelfile in TunnelFile.objects.all():
+ SetTunnelfileInfo(tunnelfile)
+
+
+
+