Diffstat (limited to 'parsers')
-rw-r--r--  parsers/QMs.py        20
-rw-r--r--  parsers/caves.py      50
-rw-r--r--  parsers/cavesM.py    213
-rw-r--r--  parsers/logbooks.py  257
-rw-r--r--  parsers/people.py     81
-rw-r--r--  parsers/peopleM.py    27
-rw-r--r--  parsers/survex.py    262
-rw-r--r--  parsers/surveys.py     7
-rw-r--r--  parsers/surveysM.py   65
9 files changed, 420 insertions, 562 deletions
diff --git a/parsers/QMs.py b/parsers/QMs.py
index efc8cd6..602b7af 100644
--- a/parsers/QMs.py
+++ b/parsers/QMs.py
@@ -17,19 +17,19 @@ def parseCaveQMs(cave,inputFile):
try:
steinBr=Cave.objects.get(official_name="Steinbrückenhöhle")
except Cave.DoesNotExist:
- print "Steinbruckenhoehle is not in the database. Please run parsers.cavetab first."
+ print("Steinbruckenhoehle is not in the database. Please run parsers.cavetab first.")
return
elif cave=='hauch':
try:
hauchHl=Cave.objects.get(official_name="Hauchhöhle")
except Cave.DoesNotExist:
- print "Hauchhoele is not in the database. Please run parsers.cavetab first."
+ print("Hauchhoele is not in the database. Please run parsers.cavetab first.")
return
elif cave =='kh':
try:
kh=Cave.objects.get(official_name="Kaninchenhöhle")
except Cave.DoesNotExist:
- print "KH is not in the database. Please run parsers.cavetab first."
+ print("KH is not in the database. Please run parsers.cavetab first.")
parse_KH_QMs(kh, inputFile=inputFile)
return
@@ -48,7 +48,7 @@ def parseCaveQMs(cave,inputFile):
elif cave=='hauch':
placeholder, hadToCreate = LogbookEntry.objects.get_or_create(date__year=year, title="placeholder for QMs in 234", text="QMs temporarily attached to this should be re-attached to their actual trips", defaults={"date": date(year, 1, 1),"cave":hauchHl})
if hadToCreate:
- print cave+" placeholder logbook entry for " + str(year) + " added to database"
+ print(cave + " placeholder logbook entry for " + str(year) + " added to database")
QMnum=re.match(r".*?-\d*?-X?(?P<numb>\d*)",line[0]).group("numb")
newQM = QM()
newQM.found_by=placeholder
@@ -71,19 +71,18 @@ def parseCaveQMs(cave,inputFile):
if preexistingQM.new_since_parsing==False: #if the pre-existing QM has not been modified, overwrite it
preexistingQM.delete()
newQM.save()
- print "overwriting " + str(preexistingQM) +"\r",
-
+ print("overwriting " + str(preexistingQM) +"\r")
else: # otherwise, print that it was ignored
- print "preserving "+ str(preexistingQM) + ", which was edited in admin \r",
+ print("preserving " + str(preexistingQM) + ", which was edited in admin \r")
except QM.DoesNotExist: #if there is no pre-existing QM, save the new one
newQM.save()
- print "QM "+str(newQM) + ' added to database\r',
+ print("QM "+str(newQM) + ' added to database\r')
except KeyError: #check on this one
continue
except IndexError:
- print "Index error in " + str(line)
+ print("Index error in " + str(line))
continue
def parse_KH_QMs(kh, inputFile):
@@ -104,7 +103,7 @@ def parse_KH_QMs(kh, inputFile):
}
nonLookupArgs={
'grade':res['grade'],
- 'nearest_station':res['nearest_station'],
+ 'nearest_station_name':res['nearest_station'],
'location_description':res['description']
}
@@ -115,3 +114,4 @@ parseCaveQMs(cave='stein',inputFile=r"1623/204/qm.csv")
parseCaveQMs(cave='hauch',inputFile=r"1623/234/qm.csv")
parseCaveQMs(cave='kh', inputFile="1623/161/qmtodo.htm")
#parseCaveQMs(cave='balkonhoehle',inputFile=r"1623/264/qm.csv")
+
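
Most of the QMs.py change is the Python 3 print() conversion, plus one rename: the 'nearest_station' entry in nonLookupArgs becomes 'nearest_station_name', tracking a field rename in the QM model. For orientation, a minimal sketch of how a lookupArgs/nonLookupArgs pair feeds Django's get_or_create; the new_since_parsing check mirrors the hunk above, while the helper itself and the import line are assumptions, not code from this commit:

    from troggle.core.models import QM  # import style assumed

    def save_qm(lookupArgs, nonLookupArgs):
        # lookupArgs identify the row; nonLookupArgs are the fields a reload
        # may overwrite, unless an admin edited the QM (new_since_parsing)
        qm, created = QM.objects.get_or_create(defaults=nonLookupArgs, **lookupArgs)
        if not created and not qm.new_since_parsing:
            for field, value in nonLookupArgs.items():
                setattr(qm, field, value)
            qm.save()
        return qm
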
diff --git a/parsers/caves.py b/parsers/caves.py
index ba1c358..606007f 100644
--- a/parsers/caves.py
+++ b/parsers/caves.py
@@ -6,16 +6,18 @@ import re
def readcaves():
- newArea = models.Area(short_name = "1623", parent = None)
- newArea.save()
- newArea = models.Area(short_name = "1626", parent = None)
- newArea.save()
- print("Reading Entrances")
+
+ # Clear the cave data issues as we are reloading
+ models.DataIssue.objects.filter(parser='caves').delete()
+
+ area_1623 = models.Area.objects.update_or_create(short_name = "1623", parent = None)
+ area_1626 = models.Area.objects.update_or_create(short_name = "1626", parent = None)
+ print(" - Reading Entrances")
#print "list of <Slug> <Filename>"
for filename in os.walk(settings.ENTRANCEDESCRIPTIONS).next()[2]: #Should be a better way of getting a list of files
if filename.endswith('.html'):
readentrance(filename)
- print ("Reading Caves")
+ print (" - Reading Caves")
for filename in os.walk(settings.CAVEDESCRIPTIONS).next()[2]: #Should be a better way of getting a list of files
if filename.endswith('.html'):
readcave(filename)
@@ -51,7 +53,7 @@ def readentrance(filename):
bearings = getXML(entrancecontents, "bearings", maxItems = 1, context = context)
url = getXML(entrancecontents, "url", maxItems = 1, context = context)
if len(non_public) == 1 and len(slugs) >= 1 and len(name) >= 1 and len(entrance_description) == 1 and len(explorers) == 1 and len(map_description) == 1 and len(location_description) == 1 and len(approach) == 1 and len(underground_description) == 1 and len(marking) == 1 and len(marking_comment) == 1 and len(findability) == 1 and len(findability_description) == 1 and len(alt) == 1 and len(northing) == 1 and len(easting) == 1 and len(tag_station) == 1 and len(exact_station) == 1 and len(other_station) == 1 and len(other_description) == 1 and len(bearings) == 1 and len(url) == 1:
- e = models.Entrance(name = name[0],
+ e, state = models.Entrance.objects.update_or_create(name = name[0],
non_public = {"True": True, "False": False, "true": True, "false": False,}[non_public[0]],
entrance_description = entrance_description[0],
explorers = explorers[0],
@@ -75,14 +77,12 @@ def readentrance(filename):
url = url[0],
filename = filename,
cached_primary_slug = slugs[0])
- e.save()
primary = True
for slug in slugs:
#print slug, filename
- cs = models.EntranceSlug(entrance = e,
+ cs = models.EntranceSlug.objects.update_or_create(entrance = e,
slug = slug,
primary = primary)
- cs.save()
primary = False
def readcave(filename):
@@ -117,7 +117,7 @@ def readcave(filename):
url = getXML(cavecontents, "url", maxItems = 1, context = context)
entrances = getXML(cavecontents, "entrance", context = context)
if len(non_public) == 1 and len(slugs) >= 1 and len(official_name) == 1 and len(areas) >= 1 and len(kataster_code) == 1 and len(kataster_number) == 1 and len(unofficial_number) == 1 and len(explorers) == 1 and len(underground_description) == 1 and len(equipment) == 1 and len(references) == 1 and len(survey) == 1 and len(kataster_status) == 1 and len(underground_centre_line) == 1 and len(notes) == 1 and len(length) == 1 and len(depth) == 1 and len(extent) == 1 and len(survex_file) == 1 and len(description_file ) == 1 and len(url) == 1 and len(entrances) >= 1:
- c = models.Cave(non_public = {"True": True, "False": False, "true": True, "false": False,}[non_public[0]],
+ c, state = models.Cave.objects.update_or_create(non_public = {"True": True, "False": False, "true": True, "false": False,}[non_public[0]],
official_name = official_name[0],
kataster_code = kataster_code[0],
kataster_number = kataster_number[0],
@@ -137,7 +137,6 @@ def readcave(filename):
description_file = description_file[0],
url = url[0],
filename = filename)
- c.save()
for area_slug in areas:
area = models.Area.objects.filter(short_name = area_slug)
if area:
@@ -149,12 +148,13 @@ def readcave(filename):
primary = True
for slug in slugs:
try:
- cs = models.CaveSlug(cave = c,
+ cs = models.CaveSlug.objects.update_or_create(cave = c,
slug = slug,
primary = primary)
- cs.save()
except:
- print("Can't find text (slug): %s, skipping %s" % (slug, context))
+ message = "Can't find text (slug): %s, skipping %s" % (slug, context)
+ models.DataIssue.objects.create(parser='caves', message=message)
+ print(message)
primary = False
for entrance in entrances:
@@ -162,20 +162,26 @@ def readcave(filename):
letter = getXML(entrance, "letter", maxItems = 1, context = context)[0]
try:
entrance = models.Entrance.objects.get(entranceslug__slug = slug)
- ce = models.CaveAndEntrance(cave = c, entrance_letter = letter, entrance = entrance)
- ce.save()
+ ce = models.CaveAndEntrance.objects.update_or_create(cave = c, entrance_letter = letter, entrance = entrance)
except:
- print ("Entrance text (slug) %s missing %s" % (slug, context))
+ message = "Entrance text (slug) %s missing %s" % (slug, context)
+ models.DataIssue.objects.create(parser='caves', message=message)
+ print(message)
def getXML(text, itemname, minItems = 1, maxItems = None, printwarnings = True, context = ""):
items = re.findall("<%(itemname)s>(.*?)</%(itemname)s>" % {"itemname": itemname}, text, re.S)
if len(items) < minItems and printwarnings:
- print("%(count)i %(itemname)s found, at least %(min)i expected" % {"count": len(items),
+ message = "%(count)i %(itemname)s found, at least %(min)i expected" % {"count": len(items),
"itemname": itemname,
- "min": minItems} + context)
+ "min": minItems} + context
+ models.DataIssue.objects.create(parser='caves', message=message)
+ print(message)
+
if maxItems is not None and len(items) > maxItems and printwarnings:
- print("%(count)i %(itemname)s found, no more than %(max)i expected" % {"count": len(items),
+ message = "%(count)i %(itemname)s found, no more than %(max)i expected" % {"count": len(items),
"itemname": itemname,
- "max": maxItems} + context)
+ "max": maxItems} + context
+ models.DataIssue.objects.create(parser='caves', message=message)
+ print(message)
return items
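
caves.py now persists every parser warning as a DataIssue row as well as printing it, and deletes its own rows at the start of a reload so stale issues do not accumulate. The create-then-print pair recurs often enough that a small helper would tidy it; a hypothetical sketch, assuming only the DataIssue model used above:

    import troggle.core.models as models

    def log_issue(parser, message):
        # persist the warning for later review and echo it to the console
        models.DataIssue.objects.create(parser=parser, message=message)
        print(message)

    # at the top of a reload, drop stale issues for this parser only:
    models.DataIssue.objects.filter(parser='caves').delete()
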
diff --git a/parsers/cavesM.py b/parsers/cavesM.py
deleted file mode 100644
index f9900d6..0000000
--- a/parsers/cavesM.py
+++ /dev/null
@@ -1,213 +0,0 @@
-
-import troggle.core.models as models #import models for various objects
-from django.conf import settings
-import xml.etree.ElementTree as ET #this is used to parse XML's
-import subprocess
-import re
-
-#
-# This parser has to find several things:
-# There are files of .html format in expoweb area - they contain some of the important information
-# There is a similar number of .svx files in loser are - they contain all the measurements
-#
-# Previous version was incredibly slow due to various shitty ideas about finding things
-# and overelayance on python when handling regular expressions, new version delegates heavy lifting to shell
-# and handles more sophisticated bits only
-#
-
-def load():
- print('Hi! I\'m caves parser. Ready to work')
-
- print('Loading caves of 1623 area')
- loadarea('1623')
-
- print('Loading caves of 1626 area')
- loadarea('1626')
-
-
-def loadarea(areacode):
-
- if not file_exists(settings.SURVEX_DATA+'1623-and-1626.3d'):
- print('Computing master .3d file')
- bash('cavern -o'+settings.SURVEX_DATA+' '+settings.SURVEX_DATA+'1623-and-1626.svx')
- else:
- print('Loading from existing master .3d file')
-
- master3d = bash('dump3d -d '+settings.SURVEX_DATA+'1623-and-1626.3d').splitlines()
- master3dN = [x for x in master3d if ('NODE' in x)] #list of nodes of master survex file
- master3dL = [x for x in master3d if ('LINE' in x)] #list of nodes of master survex file
-
- print('Searching all cave dirs files')
- basedir = settings.SURVEX_DATA+'caves-'+areacode+'/'
-
- cavedirs = bash("find "+basedir+" -maxdepth 1 -type d").splitlines() #this command finds all directories
- print('Obtained list of directories! (#dirs='+str(len(cavedirs))+')')
- ndirs = len(cavedirs) #remember number of dirs for nice debug output
-
- for cavedir in cavedirs:
- if cavedir==basedir:
- continue #skip the basedir - a non-proper subdirectory
- cavename = bash('echo '+cavedir+' | rev | cut -f1 -d \'/\' | rev').splitlines()[0] #get final bit of the directory
-
- test = bash('if [ ! -f '+cavedir+'/'+cavename+'.svx ] ; then echo MISSING; fi')#test for file exisence
- if not file_exists(cavedir+'/'+cavename+'.svx'):
- msg = models.Parser_messageM(parsername='caves',content=cavedir+'/'+cavename+' MISSING!',message_type='warn')
- print('Cave missing'+cavename+' :(')
- msg.save()
- continue
- fullname=cavedir+'/'+cavename+'.svx'
- print('Found cave:'+cavename)
- cavernout = bash('cavern -o '+cavedir+' '+fullname) #make cavern process the thing
- if 'cavern: error:' in cavernout:
- msg = models.Parser_messageM(parsername='caves',content=cavedir+'/'+cavename+' Survex file messed up!',message_type='warn')
- print('Fucked svx'+cavename+' :(')
- msg.save()
- continue
-
- cavernout = cavernout.splitlines()
- depth = float(([x for x in cavernout if ('Total vertical length' in x)][0].split()[-1])[:-2])
- length = float(([x for x in cavernout if ('Total length' in x)][0].split()[6])[:-1])
- cavefile = open(fullname,'r')
- cavefilecontents = cavefile.read().splitlines()
- surveyname = [x for x in cavefilecontents if ('*begin ') in x][0].split()[1].lower()
- try:
- title = [x for x in cavefilecontents if ('*title ') in x][0].split()[1]
- except:
- syrveyname = "Untitled"
-
- relevant_nodes = [x for x in master3dN if (('['+areacode+'.'+surveyname+'.' in x) or ('['+areacode+'.'+surveyname+']' in x))]
- entrance_nodes = [x for x in relevant_nodes if 'ENTRANCE' in x]
- surface_nodes = [x for x in relevant_nodes if 'SURFACE' in x]
- location_nodes = []
- print('rel_nodes'+str(len(relevant_nodes)))
- if len(entrance_nodes) > 0:
- location_nodes = entrance_nodes
- elif len(surface_nodes) > 0:
- location_nodes = surface_nodes
- elif len(relevant_nodes) > 0:
- location_nodes = relevant_nodes
-
- try:
- location = sorted(location_nodes, key = lambda y : float(y.split()[3])).pop()
- except:
- print(location_nodes)
- location = 'Not found'
-
- relevant_lines = [x for x in master3dL if (('['+areacode+'.'+surveyname+'.' in x) or ('['+areacode+'.'+surveyname+']' in x))]
- try:
- lastleg = sorted(relevant_lines, key = lambda y : y.split().pop()).pop()
- except:
- lastleg = ['LINE 1900.01.01']
- try:
- lastdate = lastleg.split().pop()
- if 'STYLE' in lastdate:
- lastdate = lastleg.split().pop().pop()
- except:
- lastdate = '1900.01.01'
-
- entrance = ' '.join(location.split()[1:3])
- print((('depth','length','surv name','entr','date'),(depth,length,surveyname,entrance,lastdate))) #sanity check print
-
-
- newcave = models.CaveM(
- survex_file = fullname,
- total_length = length,
- name=areacode+'.'+surveyname,
- total_depth = depth,
- date = lastdate,
- entrance = entrance)
- newcave.save()
- #end of reading survex masterfiles
-
- print ("Reading cave descriptions")
- cavefiles = bash('find '+settings.CAVEDESCRIPTIONS+' -name \'*.html\'').splitlines()
- for fn in cavefiles:
- f = open(fn, "r")
- print(fn)
- contents = f.read()
-
- slug = re.sub(r"\s+", "", extractXML(contents,'caveslug'))
- desc = extractXML(contents,'underground_description')
- name = slug[5:] #get survex compatible name
- area = slug[0:4]
-
- print([area,name])
-
- if desc==None or name==None:
- msg = models.Parser_messageM(parsername='caves',content=fn+' Description meesed up!',message_type='warn')
- print('Fucked description '+fn+' :(')
- msg.save()
- continue
-
- print(area+'/'+name+'/'+name+'.svx')
-
- updatecave = models.CaveM.objects.filter(survex_file__icontains=area+'/'+name+'/'+name+'.svx')
- if len(updatecave)>1:
- print('Non unique solution - skipping. Name:'+name)
- elif len(updatecave)==0:
- print('Cave with no survex data:'+name)
- continue
- else: #exaclty one match
- print('Adding desc:'+name)
- updatecave = updatecave[0]
- updatecave.description = '/cave/descriptionM/'+slug #area-name
- updatecave.title=name
- updatecave.save()
-
- slugS = slug
- explorersS = extractXML(contents,'explorers')
- underground_descriptionS = extractXML(contents,'underground_description')
- equipmentS = extractXML(contents,'equipment')
- referencesS = extractXML(contents,'references')
- surveyS = extractXML(contents,'survey')
- kataster_statusS = extractXML(contents,'kataster_status')
- underground_centre_lineS = extractXML(contents,'underground_centre_line')
- survex_fileS = extractXML(contents,'survex_file')
- notesS = extractXML(contents,'notes')
-
-
- newcavedesc = models.Cave_descriptionM(
- slug = slugS,
- explorers = explorersS,
- underground_description = underground_descriptionS,
- equipment = equipmentS,
- references = referencesS,
- survey = surveyS,
- kataster_status = kataster_statusS,
- underground_centre_line = underground_centre_lineS,
- survex_file = survex_fileS,
- notes = notesS)
- newcavedesc.save()
-
-
-
-
- #end of reading cave descriptions
-
-def file_exists(filename):
- test = bash('if [ ! -f '+filename+' ] ; then echo MISSING; fi')#test for file exisence
- if 'MISSING' in test: #send error message to the database
- return False
- return True
-
-def extractXML(contents,tag):
- #find correct lines
- lines = contents.splitlines()
- beg = [x for x in lines if ('<'+tag+'>' in x)]
- end = [x for x in lines if ('</'+tag+'>' in x)]
- if (not beg) or (not end):
- return None
- begi = lines.index(beg[0])
- endi = lines.index(end[0])
- if endi!=begi:
- segment = '\n'.join(lines[begi:endi+1])
- else:
- segment = lines[begi:endi+1][0]
-
- hit = re.findall('<'+tag+'>(.*)</'+tag+'>', segment, re.S)[0]
- return hit
-
-def bash(cmd): #calls command in bash shell, returns output
- process = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
- output, error = process.communicate()
- return output
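
The deleted cavesM.py shelled out through a bash() helper that built command strings by concatenation and ran them with shell=True, so any path containing spaces or shell metacharacters would have broken it. For reference, a sketch of the same dump3d call using an argument list instead of a shell (the .3d path is illustrative):

    import subprocess

    def run(args):
        # run a command without a shell; returns stdout as text
        return subprocess.check_output(args, universal_newlines=True)

    master3d = run(["dump3d", "-d", "/path/to/1623-and-1626.3d"]).splitlines()
    nodes = [line for line in master3d if "NODE" in line]
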
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index cb40f58..9dfa31b 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -7,6 +7,8 @@ from parsers.people import GetPersonExpeditionNameLookup
from parsers.cavetab import GetCaveLookup
from django.template.defaultfilters import slugify
+from django.utils.timezone import get_current_timezone
+from django.utils.timezone import make_aware
import csv
import re
@@ -23,19 +25,23 @@ from utils import save_carefully
#
# the logbook loading section
#
-def GetTripPersons(trippeople, expedition, logtime_underground):
+def GetTripPersons(trippeople, expedition, logtime_underground):
res = [ ]
author = None
- for tripperson in re.split(",|\+|&amp;|&(?!\w+;)| and ", trippeople):
+ round_bracket_regex = re.compile(r"[\(\[].*?[\)\]]")
+ for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
tripperson = tripperson.strip()
- mul = re.match("<u>(.*?)</u>$(?i)", tripperson)
+ mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson)
if mul:
tripperson = mul.group(1).strip()
if tripperson and tripperson[0] != '*':
#assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
+ tripperson = re.sub(round_bracket_regex, "", tripperson).strip()
personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
if not personyear:
- print "NoMatchFor: '%s'" % tripperson
+ print(" - No name match for: '%s'" % tripperson)
+ message = "No name match for: '%s' in year '%s'" % (tripperson, expedition.year)
+ models.DataIssue.objects.create(parser='logbooks', message=message)
res.append((personyear, logtime_underground))
if mul:
author = personyear
@@ -45,7 +51,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground):
author = res[-1][0]
return res, author
-def GetTripCave(place): #need to be fuzzier about matching here. Already a very slow function...
+def GetTripCave(place): #need to be fuzzier about matching here. Already a very slow function...
# print "Getting cave for " , place
try:
katastNumRes=[]
@@ -65,32 +71,34 @@ def GetTripCave(place): #need to be fuzzier about matching h
return tripCaveRes
elif len(tripCaveRes)>1:
- print "Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes)
+ print("Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes))
correctIndex=input("type list index of correct cave")
return tripCaveRes[correctIndex]
else:
- print "No cave found for place " , place
+ print("No cave found for place " , place)
return
noncaveplaces = [ "Journey", "Loser Plateau" ]
-def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground):
+def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"):
""" saves a logbook entry and related persontrips """
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
if not author:
- print "skipping logentry", title
+ print(" - Skipping logentry: " + title + " - no author for entry")
+ message = "Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
+ models.DataIssue.objects.create(parser='logbooks', message=message)
return
-
-# tripCave = GetTripCave(place)
- #
+
+ #tripCave = GetTripCave(place)
+
lplace = place.lower()
if lplace not in noncaveplaces:
cave=GetCaveLookup().get(lplace)
#Check for an existing copy of the current entry, and save
expeditionday = expedition.get_expedition_day(date)
- lookupAttribs={'date':date, 'title':title}
- nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50]}
+ lookupAttribs={'date':date, 'title':title}
+ nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50], 'entry_type':entry_type}
lbo, created=save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)
for tripperson, time_underground in trippersons:
@@ -102,8 +110,8 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
def ParseDate(tripdate, year):
""" Interprets dates in the expo logbooks and returns a correct datetime.date object """
- mdatestandard = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
- mdategoof = re.match("(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
+ mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
+ mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
if mdatestandard:
assert mdatestandard.group(1) == year, (tripdate, year)
year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
@@ -115,9 +123,9 @@ def ParseDate(tripdate, year):
assert False, tripdate
return datetime.date(year, month, day)
-# 2007, 2008, 2006
+# 2006, 2008 - 2010
def Parselogwikitxt(year, expedition, txt):
- trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt)
+ trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
for triphead, triptext in trippara:
tripheadp = triphead.split("|")
#print "ttt", tripheadp
@@ -126,7 +134,7 @@ def Parselogwikitxt(year, expedition, txt):
tripsplace = tripplace.split(" - ")
tripcave = tripsplace[0].strip()
- tul = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
+ tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
if tul:
#assert len(tul) <= 1, (triphead, triptext)
#assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
@@ -140,12 +148,16 @@ def Parselogwikitxt(year, expedition, txt):
#print "\n", tripcave, "--- ppp", trippeople, len(triptext)
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-# 2002, 2004, 2005
+# 2002, 2004, 2005, 2007, 2011 - 2018
def Parseloghtmltxt(year, expedition, txt):
- tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
+ #print(" - Starting log html parser")
+ tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
+ logbook_entry_count = 0
for trippara in tripparas:
+ #print(" - HR detected - maybe a trip?")
+ logbook_entry_count += 1
- s = re.match('''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
+ s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
\s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
\s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
\s*<div\s+class="trippeople">\s*(.*?)</div>
@@ -155,38 +167,41 @@ def Parseloghtmltxt(year, expedition, txt):
\s*$
''', trippara)
if not s:
- if not re.search("Rigging Guide", trippara):
- print "can't parse: ", trippara # this is 2007 which needs editing
+ if not re.search(r"Rigging Guide", trippara):
+ print("can't parse: ", trippara) # this is 2007 which needs editing
#assert s, trippara
continue
-
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
ldate = ParseDate(tripdate.strip(), year)
#assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
- trippeople = re.sub("Ol(?!l)", "Olly", trippeople)
- trippeople = re.sub("Wook(?!e)", "Wookey", trippeople)
+ #trippeople = re.sub(r"Ol(?!l)", "Olly", trippeople)
+ #trippeople = re.sub(r"Wook(?!e)", "Wookey", trippeople)
triptitles = triptitle.split(" - ")
if len(triptitles) >= 2:
tripcave = triptitles[0]
else:
tripcave = "UNKNOWN"
- #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
- ltriptext = re.sub("</p>", "", triptext)
- ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
- EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
-
-# main parser for pre-2001. simpler because the data has been hacked so much to fit it
+ #print("\n", tripcave, "--- ppp", trippeople, len(triptext))
+ ltriptext = re.sub(r"</p>", "", triptext)
+ ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+ ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip()
+ EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
+ trippeople=trippeople, expedition=expedition, logtime_underground=0,
+ entry_type="html")
+ if logbook_entry_count == 0:
+        print(" - No trip entries found in logbook, check the syntax matches htmltxt format")
+
+
+# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
def Parseloghtml01(year, expedition, txt):
- tripparas = re.findall("<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
+ tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
for trippara in tripparas:
s = re.match(u"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
assert s, trippara[:300]
tripheader, triptext = s.group(1), s.group(2)
- mtripid = re.search('<a id="(.*?)"', tripheader)
+ mtripid = re.search(r'<a id="(.*?)"', tripheader)
tripid = mtripid and mtripid.group(1) or ""
- tripheader = re.sub("</?(?:[ab]|span)[^>]*>", "", tripheader)
+ tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
#print " ", [tripheader]
#continue
@@ -194,7 +209,7 @@ def Parseloghtml01(year, expedition, txt):
tripdate, triptitle, trippeople = tripheader.split("|")
ldate = ParseDate(tripdate.strip(), year)
- mtu = re.search('<p[^>]*>(T/?U.*)', triptext)
+ mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
if mtu:
tu = mtu.group(1)
triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
@@ -206,38 +221,40 @@ def Parseloghtml01(year, expedition, txt):
ltriptext = triptext
- mtail = re.search('(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
+ mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
if mtail:
#print mtail.group(0)
ltriptext = ltriptext[:mtail.start(0)]
- ltriptext = re.sub("</p>", "", ltriptext)
- ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub("<p>|<br>", "\n\n", ltriptext).strip()
+ ltriptext = re.sub(r"</p>", "", ltriptext)
+ ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+ ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
#ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
- ltriptext = re.sub("</?u>", "_", ltriptext)
- ltriptext = re.sub("</?i>", "''", ltriptext)
- ltriptext = re.sub("</?b>", "'''", ltriptext)
+ ltriptext = re.sub(r"</?u>", "_", ltriptext)
+ ltriptext = re.sub(r"</?i>", "''", ltriptext)
+ ltriptext = re.sub(r"</?b>", "'''", ltriptext)
#print ldate, trippeople.strip()
# could include the tripid (url link for cross referencing)
- EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
+ EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
+ trippeople=trippeople, expedition=expedition, logtime_underground=0,
+ entry_type="html")
+# parser for 2003
def Parseloghtml03(year, expedition, txt):
- tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
+ tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
for trippara in tripparas:
s = re.match(u"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
assert s, trippara
tripheader, triptext = s.group(1), s.group(2)
- tripheader = re.sub("&nbsp;", " ", tripheader)
- tripheader = re.sub("\s+", " ", tripheader).strip()
+ tripheader = re.sub(r"&nbsp;", " ", tripheader)
+ tripheader = re.sub(r"\s+", " ", tripheader).strip()
sheader = tripheader.split(" -- ")
tu = ""
if re.match("T/U|Time underwater", sheader[-1]):
tu = sheader.pop()
if len(sheader) != 3:
- print "header not three pieces", sheader
+ print("header not three pieces", sheader)
tripdate, triptitle, trippeople = sheader
ldate = ParseDate(tripdate.strip(), year)
triptitles = triptitle.split(" , ")
@@ -246,37 +263,14 @@ def Parseloghtml03(year, expedition, txt):
else:
tripcave = "UNKNOWN"
#print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
- ltriptext = re.sub("</p>", "", triptext)
- ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
- ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
- ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
- EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-
-yearlinks = [
-# ("2013", "2013/logbook.html", Parseloghtmltxt),
- ("2012", "2012/logbook.html", Parseloghtmltxt),
- ("2011", "2011/logbook.html", Parseloghtmltxt),
- ("2010", "2010/logbook.html", Parselogwikitxt),
- ("2009", "2009/2009logbook.txt", Parselogwikitxt),
- ("2008", "2008/2008logbook.txt", Parselogwikitxt),
- ("2007", "2007/logbook.html", Parseloghtmltxt),
- ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt),
- ("2005", "2005/logbook.html", Parseloghtmltxt),
- ("2004", "2004/logbook.html", Parseloghtmltxt),
- ("2003", "2003/logbook.html", Parseloghtml03),
- ("2002", "2002/logbook.html", Parseloghtmltxt),
- ("2001", "2001/log.htm", Parseloghtml01),
- ("2000", "2000/log.htm", Parseloghtml01),
- ("1999", "1999/log.htm", Parseloghtml01),
- ("1998", "1998/log.htm", Parseloghtml01),
- ("1997", "1997/log.htm", Parseloghtml01),
- ("1996", "1996/log.htm", Parseloghtml01),
- ("1995", "1995/log.htm", Parseloghtml01),
- ("1994", "1994/log.htm", Parseloghtml01),
- ("1993", "1993/log.htm", Parseloghtml01),
- ("1992", "1992/log.htm", Parseloghtml01),
- ("1991", "1991/log.htm", Parseloghtml01),
- ]
+ ltriptext = re.sub(r"</p>", "", triptext)
+ ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+ ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
+ ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
+ EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
+ text = ltriptext, trippeople=trippeople, expedition=expedition,
+ logtime_underground=0, entry_type="html")
+
def SetDatesFromLogbookEntries(expedition):
"""
@@ -295,54 +289,67 @@ def SetDatesFromLogbookEntries(expedition):
persontrip.persontrip_next = None
lprevpersontrip = persontrip
persontrip.save()
-
-
-
+
+
def LoadLogbookForExpedition(expedition):
""" Parses all logbook entries for one expedition """
- expowebbase = os.path.join(settings.EXPOWEB, "years")
- year = str(expedition.year)
- for lyear, lloc, parsefunc in yearlinks:
- if lyear == year:
- break
- fin = open(os.path.join(expowebbase, lloc))
- print "opennning", lloc
- txt = fin.read().decode("latin1")
- fin.close()
- parsefunc(year, expedition, txt)
- SetDatesFromLogbookEntries(expedition)
- return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
+ expowebbase = os.path.join(settings.EXPOWEB, "years")
+ yearlinks = settings.LOGBOOK_PARSER_SETTINGS
+
+ logbook_parseable = False
+
+ if expedition.year in yearlinks:
+ year_settings = yearlinks[expedition.year]
+ file_in = open(os.path.join(expowebbase, year_settings[0]))
+ txt = file_in.read().decode("latin1")
+ file_in.close()
+ parsefunc = year_settings[1]
+ logbook_parseable = True
+ print(" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1])
+ else:
+ try:
+ file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE))
+ txt = file_in.read().decode("latin1")
+ file_in.close()
+ logbook_parseable = True
+            print("No set parser found, using default")
+ parsefunc = settings.DEFAULT_LOGBOOK_PARSER
+ except (IOError):
+ logbook_parseable = False
+ print("Couldn't open default logbook file and nothing in settings for expo " + expedition.year)
+
+ if logbook_parseable:
+ parser = globals()[parsefunc]
+ parser(expedition.year, expedition, txt)
+ SetDatesFromLogbookEntries(expedition)
+
+ #return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
def LoadLogbooks():
- """ This is the master function for parsing all logbooks into the Troggle database. Requires yearlinks, which is a list of tuples for each expedition with expedition year, logbook path, and parsing function. """
-
- #Deletion has been moved to a seperate function to enable the non-destructive importing
- #models.LogbookEntry.objects.all().delete()
- expowebbase = os.path.join(settings.EXPOWEB, "years")
- #yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite
- #yearlinks = [ ("1996", "1996/log.htm", Parseloghtml01),] # overwrite
-
- for year, lloc, parsefunc in yearlinks:
- # This will not work until the corresponding year exists in the database.
- # In 2012 this needed noscript/folk.csv to be updated first.
- expedition = models.Expedition.objects.filter(year = year)[0]
- fin = open(os.path.join(expowebbase, lloc))
- txt = fin.read().decode("latin1")
- fin.close()
- parsefunc(year, expedition, txt)
- SetDatesFromLogbookEntries(expedition)
+ """ This is the master function for parsing all logbooks into the Troggle database. """
+
+ # Clear the logbook data issues as we are reloading
+ models.DataIssue.objects.filter(parser='logbooks').delete()
+ # Fetch all expos
+ expos = models.Expedition.objects.all()
+ for expo in expos:
+ print("\nLoading Logbook for: " + expo.year)
+
+ # Load logbook for expo
+ LoadLogbookForExpedition(expo)
+
-dateRegex = re.compile('<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
-expeditionYearRegex = re.compile('<span\s+class="expeditionyear">(.*?)</span>', re.S)
-titleRegex = re.compile('<H1>(.*?)</H1>', re.S)
-reportRegex = re.compile('<div\s+class="report">(.*)</div>\s*</body>', re.S)
-personRegex = re.compile('<div\s+class="person">(.*?)</div>', re.S)
-nameAuthorRegex = re.compile('<span\s+class="name(,author|)">(.*?)</span>', re.S)
-TURegex = re.compile('<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
-locationRegex = re.compile('<span\s+class="location">(.*?)</span>', re.S)
-caveRegex = re.compile('<span\s+class="cave">(.*?)</span>', re.S)
+dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
+expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
+titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
+reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
+personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
+nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
+TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
+locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
+caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
def parseAutoLogBookEntry(filename):
errors = []
@@ -435,4 +442,4 @@ def parseAutoLogBookEntry(filename):
time_underground = TU,
logbook_entry = logbookEntry,
is_logbook_entry_author = author).save()
- print logbookEntry
+ print(logbookEntry)
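
LoadLogbookForExpedition now takes its per-year table from settings.LOGBOOK_PARSER_SETTINGS instead of the deleted yearlinks list. The actual table lives in settings.py and is not part of this diff; from the usage above (year_settings[0] is a path under the years directory, year_settings[1] is a parser-function name resolved via globals()), it is presumably shaped like this, with the DEFAULT_* values guessed from the fallback branch and the parser comments:

    LOGBOOK_PARSER_SETTINGS = {
        "2010": ("2010/logbook.html", "Parselogwikitxt"),
        "2003": ("2003/logbook.html", "Parseloghtml03"),
        "2001": ("2001/log.htm", "Parseloghtml01"),
        # ... one entry per year that needs a non-default parser
    }
    DEFAULT_LOGBOOK_FILE = "logbook.html"       # filename guessed
    DEFAULT_LOGBOOK_PARSER = "Parseloghtmltxt"  # the 2011-2018 format
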
diff --git a/parsers/people.py b/parsers/people.py
index 4dba3a8..34a5ff3 100644
--- a/parsers/people.py
+++ b/parsers/people.py
@@ -4,6 +4,8 @@ from django.conf import settings
import troggle.core.models as models
import csv, re, datetime, os, shutil
from utils import save_carefully
+from HTMLParser import HTMLParser
+from unidecode import unidecode
def saveMugShot(mugShotPath, mugShotFilename, person):
if mugShotFilename.startswith(r'i/'): #if filename in cell has the directory attached (I think they all do), remove it
@@ -44,13 +46,13 @@ def parseMugShotAndBlurb(personline, header, person):
def LoadPersonsExpos():
- persontab = open(os.path.join(settings.EXPOWEB, "noinfo", "folk.csv"))
+ persontab = open(os.path.join(settings.EXPOWEB, "folk", "folk.csv"))
personreader = csv.reader(persontab)
headers = personreader.next()
header = dict(zip(headers, range(len(headers))))
# make expeditions
- print "Loading expeditions"
+ print("Loading expeditions")
years = headers[5:]
for year in years:
@@ -59,20 +61,35 @@ def LoadPersonsExpos():
save_carefully(models.Expedition, lookupAttribs, nonLookupAttribs)
-
# make persons
- print "Loading personexpeditions"
+ print("Loading personexpeditions")
for personline in personreader:
name = personline[header["Name"]]
- name = re.sub("<.*?>", "", name)
- mname = re.match("(\w+)(?:\s((?:van |ten )?\w+))?(?:\s\(([^)]*)\))?", name)
- nickname = mname.group(3) or ""
-
- lookupAttribs={'first_name':mname.group(1), 'last_name':(mname.group(2) or "")}
- nonLookupAttribs={'is_vfho':personline[header["VfHO member"]],}
+ name = re.sub(r"<.*?>", "", name)
+
+ firstname = ""
+ nickname = ""
+
+ rawlastname = personline[header["Lastname"]].strip()
+ matchlastname = re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?", rawlastname)
+ lastname = matchlastname.group(1).strip()
+
+ splitnick = re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?", name)
+ fullname = splitnick.group(1)
+
+ nickname = splitnick.group(2) or ""
+
+ fullname = fullname.strip()
+ names = fullname.split(' ')
+ firstname = names[0]
+ if len(names) == 1:
+ lastname = ""
+
+ lookupAttribs={'first_name':firstname, 'last_name':(lastname or "")}
+ nonLookupAttribs={'is_vfho':personline[header["VfHO member"]], 'fullname':fullname}
person, created = save_carefully(models.Person, lookupAttribs, nonLookupAttribs)
-
+
parseMugShotAndBlurb(personline=personline, header=header, person=person)
# make person expedition from table
@@ -83,7 +100,26 @@ def LoadPersonsExpos():
nonLookupAttribs = {'nickname':nickname, 'is_guest':(personline[header["Guest"]] == "1")}
save_carefully(models.PersonExpedition, lookupAttribs, nonLookupAttribs)
-
+
+ # this fills in those people for whom 2008 was their first expo
+ #print "Loading personexpeditions 2008"
+ #expoers2008 = """Edvin Deadman,Kathryn Hopkins,Djuke Veldhuis,Becka Lawson,Julian Todd,Natalie Uomini,Aaron Curtis,Tony Rooke,Ollie Stevens,Frank Tully,Martin Jahnke,Mark Shinwell,Jess Stirrups,Nial Peters,Serena Povia,Olly Madge,Steve Jones,Pete Harley,Eeva Makiranta,Keith Curtis""".split(",")
+ #expomissing = set(expoers2008)
+ #for name in expomissing:
+ # firstname, lastname = name.split()
+ # is_guest = name in ["Eeva Makiranta", "Keith Curtis"]
+ # print "2008:", name
+ # persons = list(models.Person.objects.filter(first_name=firstname, last_name=lastname))
+ # if not persons:
+ # person = models.Person(first_name=firstname, last_name = lastname, is_vfho = False, mug_shot = "")
+ # #person.Sethref()
+ # person.save()
+ # else:
+ # person = persons[0]
+ # expedition = models.Expedition.objects.get(year="2008")
+ # personexpedition = models.PersonExpedition(person=person, expedition=expedition, nickname="", is_guest=is_guest)
+ # personexpedition.save()
+
# used in other referencing parser functions
# expedition name lookup cached for speed (it's a very big list)
Gpersonexpeditionnamelookup = { }
@@ -96,20 +132,33 @@ def GetPersonExpeditionNameLookup(expedition):
res = { }
duplicates = set()
- print "Calculating GetPersonExpeditionNameLookup for", expedition.year
+ print("Calculating GetPersonExpeditionNameLookup for " + expedition.year)
personexpeditions = models.PersonExpedition.objects.filter(expedition=expedition)
+ htmlparser = HTMLParser()
for personexpedition in personexpeditions:
possnames = [ ]
- f = personexpedition.person.first_name.lower()
- l = personexpedition.person.last_name.lower()
+ f = unidecode(htmlparser.unescape(personexpedition.person.first_name.lower()))
+ l = unidecode(htmlparser.unescape(personexpedition.person.last_name.lower()))
+ full = unidecode(htmlparser.unescape(personexpedition.person.fullname.lower()))
if l:
possnames.append(f + " " + l)
possnames.append(f + " " + l[0])
possnames.append(f + l[0])
possnames.append(f[0] + " " + l)
possnames.append(f)
- if personexpedition.nickname:
+ if full not in possnames:
+ possnames.append(full)
+ if personexpedition.nickname not in possnames:
possnames.append(personexpedition.nickname.lower())
+ if l:
+                # This allows a nickname to be used as a short name, e.g. Phil,
+                # adding "Phil Sargent" to the list
+ if str(personexpedition.nickname.lower() + " " + l) not in possnames:
+ possnames.append(personexpedition.nickname.lower() + " " + l)
+ if str(personexpedition.nickname.lower() + " " + l[0]) not in possnames:
+ possnames.append(personexpedition.nickname.lower() + " " + l[0])
+ if str(personexpedition.nickname.lower() + l[0]) not in possnames:
+ possnames.append(personexpedition.nickname.lower() + l[0])
for possname in possnames:
if possname in res:
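
The rewritten people.py splits folk.csv's "Name" and "Lastname" columns with the two regexes above instead of the old single match. A quick check of the intended behaviour on a made-up row (the sample name comes from the nickname comment in the hunk above):

    import re

    name = "Phil Sargent (Philbert)"
    splitnick = re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?", name)
    fullname = splitnick.group(1).strip()   # "Phil Sargent"
    nickname = splitnick.group(2) or ""     # "Philbert"
    firstname = fullname.split(' ')[0]      # "Phil"
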
diff --git a/parsers/peopleM.py b/parsers/peopleM.py
deleted file mode 100644
index 62c7ce0..0000000
--- a/parsers/peopleM.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from django.conf import settings
-import troggle.core.models as models
-
-def load():
- folkfile = open(settings.EXPOWEB+"noinfo/folk.csv")
- personlines = folkfile.read().splitlines()
- persontable = [x.split(',') for x in personlines]
- years = [persontable[0][i] for i in range(5,len(persontable[0]))]
- for year in years:
- newexpedition = models.ExpeditionM( date = year )
- newexpedition.save()
- for row in persontable[1:]: #skip header
- attendedid = [i for i, x in enumerate(row) if '1' in x]
- attendedyears = [persontable[0][i] for i in attendedid if i >= 5]
- name = row[0]
- print(name+' has attended: '+', '.join(attendedyears))
- newperson = models.PersonM(
- name = name)
- newperson.save()
- for year in attendedyears:
- target = models.ExpeditionM.objects.get(date=year)
- newperson.expos_attended.add( target )
- print('Person -> Expo table created!')
-
-
-
-
diff --git a/parsers/survex.py b/parsers/survex.py
index 536314f..14bd035 100644
--- a/parsers/survex.py
+++ b/parsers/survex.py
@@ -5,20 +5,26 @@ import troggle.settings as settings
from subprocess import call, Popen, PIPE
from troggle.parsers.people import GetPersonExpeditionNameLookup
+from django.utils.timezone import get_current_timezone
+from django.utils.timezone import make_aware
+
import re
import os
+from datetime import datetime
+line_leg_regex = re.compile(r"[\d\-+.]+$")
-def LoadSurvexLineLeg(survexblock, stardata, sline, comment):
+def LoadSurvexLineLeg(survexblock, stardata, sline, comment, cave):
+    # The try/excepts here need replacing as they are relatively expensive
ls = sline.lower().split()
ssfrom = survexblock.MakeSurvexStation(ls[stardata["from"]])
ssto = survexblock.MakeSurvexStation(ls[stardata["to"]])
-
+
survexleg = models.SurvexLeg(block=survexblock, stationfrom=ssfrom, stationto=ssto)
if stardata["type"] == "normal":
try:
survexleg.tape = float(ls[stardata["tape"]])
- except ValueError:
+ except ValueError:
print("Tape misread in", survexblock.survexfile.path)
print("Stardata:", stardata)
print("Line:", ls)
@@ -53,14 +59,17 @@ def LoadSurvexLineLeg(survexblock, stardata, sline, comment):
survexleg.compass = 1000
survexleg.clino = -90.0
else:
- assert re.match(r"[\d\-+.]+$", lcompass), ls
- assert re.match(r"[\d\-+.]+$", lclino) and lclino != "-", ls
+ assert line_leg_regex.match(lcompass), ls
+ assert line_leg_regex.match(lclino) and lclino != "-", ls
survexleg.compass = float(lcompass)
survexleg.clino = float(lclino)
-
+
+ if cave:
+ survexleg.cave = cave
+
# only save proper legs
survexleg.save()
-
+
itape = stardata.get("tape")
if itape:
try:
@@ -80,96 +89,212 @@ def LoadSurvexEquate(survexblock, sline):
def LoadSurvexLinePassage(survexblock, stardata, sline, comment):
pass
-
stardatadefault = {"type":"normal", "t":"leg", "from":0, "to":1, "tape":2, "compass":3, "clino":4}
stardataparamconvert = {"length":"tape", "bearing":"compass", "gradient":"clino"}
+regex_comment = re.compile(r"([^;]*?)\s*(?:;\s*(.*))?\n?$")
+regex_ref = re.compile(r'.*?ref.*?(\d+)\s*#\s*(\d+)')
+regex_star = re.compile(r'\s*\*[\s,]*(\w+)\s*(.*?)\s*(?:;.*)?$')
+regex_team = re.compile(r"(Insts|Notes|Tape|Dog|Useless|Pics|Helper|Disto|Consultant)\s+(.*)$(?i)")
+regex_team_member = re.compile(r" and | / |, | & | \+ |^both$|^none$(?i)")
+regex_qm = re.compile(r'^\s*QM(\d)\s+?([a-dA-DxX])\s+([\w\-]+)\.(\d+)\s+(([\w\-]+)\.(\d+)|\-)\s+(.+)$')
+
def RecursiveLoad(survexblock, survexfile, fin, textlines):
iblankbegins = 0
text = [ ]
stardata = stardatadefault
teammembers = [ ]
-
-# uncomment to print out all files during parsing
- print("Reading file:", survexblock.survexfile.path)
- while True:
- svxline = fin.readline().decode("latin1")
- if not svxline:
- return
- textlines.append(svxline)
-
+
+ # uncomment to print out all files during parsing
+ print(" - Reading file: " + survexblock.survexfile.path)
+ stamp = datetime.now()
+ lineno = 0
+
+ # Try to find the cave in the DB if not use the string as before
+ path_match = re.search(r"caves-(\d\d\d\d)/(\d+|\d\d\d\d-?\w+-\d+)/", survexblock.survexfile.path)
+ if path_match:
+ pos_cave = '%s-%s' % (path_match.group(1), path_match.group(2))
+ # print('Match')
+ # print(pos_cave)
+ cave = models.getCaveByReference(pos_cave)
+ if cave:
+ survexfile.cave = cave
+ svxlines = ''
+ svxlines = fin.read().splitlines()
+ # print('Cave - preloop ' + str(survexfile.cave))
+ # print(survexblock)
+ for svxline in svxlines:
+
+ # print(survexblock)
+
+ # print(svxline)
+ # if not svxline:
+ # print(' - Not survex')
+ # return
+ # textlines.append(svxline)
+
+ lineno += 1
+
+ # print(' - Line: %d' % lineno)
+
# break the line at the comment
- sline, comment = re.match(r"([^;]*?)\s*(?:;\s*(.*))?\n?$", svxline.strip()).groups()
-
+ sline, comment = regex_comment.match(svxline.strip()).groups()
# detect ref line pointing to the scans directory
- mref = comment and re.match(r'.*?ref.*?(\d+)\s*#\s*(\d+)', comment)
+ mref = comment and regex_ref.match(comment)
if mref:
refscan = "%s#%s" % (mref.group(1), mref.group(2))
survexscansfolders = models.SurvexScansFolder.objects.filter(walletname=refscan)
if survexscansfolders:
survexblock.survexscansfolder = survexscansfolders[0]
#survexblock.refscandir = "%s/%s%%23%s" % (mref.group(1), mref.group(1), mref.group(2))
- survexblock.save()
+ survexblock.save()
continue
-
+
+ # This whole section should be moved if we can have *QM become a proper survex command
+    # Spec of QM in SVX files, currently commented out; needs adding to survex
+ # needs to match regex_qm
+ # ;Serial number grade(A/B/C/D/X) nearest-station resolution-station description
+ # ;QM1 a hobnob_hallway_2.42 hobnob-hallway_3.42 junction of keyhole passage
+ # ;QM1 a hobnob_hallway_2.42 - junction of keyhole passage
+ qmline = comment and regex_qm.match(comment)
+ if qmline:
+ print(qmline.groups())
+ #(u'1', u'B', u'miraclemaze', u'1.17', u'-', None, u'\tcontinuation of rift')
+ qm_no = qmline.group(1)
+ qm_grade = qmline.group(2)
+ qm_from_section = qmline.group(3)
+ qm_from_station = qmline.group(4)
+ qm_resolve_section = qmline.group(6)
+ qm_resolve_station = qmline.group(7)
+ qm_notes = qmline.group(8)
+
+ print('Cave - %s' % survexfile.cave)
+ print('QM no %d' % int(qm_no))
+ print('QM grade %s' % qm_grade)
+ print('QM section %s' % qm_from_section)
+ print('QM station %s' % qm_from_station)
+ print('QM res section %s' % qm_resolve_section)
+ print('QM res station %s' % qm_resolve_station)
+ print('QM notes %s' % qm_notes)
+
+            # If the QM isn't resolved (no resolving station given) then load it
+            if not qm_resolve_section or qm_resolve_section in ('-', 'None'):
+ from_section = models.SurvexBlock.objects.filter(name=qm_from_section)
+                # If we can find a section (survex note chunk, named)
+ if len(from_section) > 0:
+ print(from_section[0])
+ from_station = models.SurvexStation.objects.filter(block=from_section[0], name=qm_from_station)
+ # If we can find a from station then we have the nearest station and can import it
+ if len(from_station) > 0:
+ print(from_station[0])
+ qm = models.QM.objects.create(number=qm_no,
+ nearest_station=from_station[0],
+ grade=qm_grade.upper(),
+ location_description=qm_notes)
+ else:
+ print('QM found but resolved')
+
+ #print('Cave -sline ' + str(cave))
if not sline:
continue
-
+
# detect the star command
- mstar = re.match(r'\s*\*[\s,]*(\w+)\s*(.*?)\s*(?:;.*)?$', sline)
+ mstar = regex_star.match(sline)
if not mstar:
if "from" in stardata:
- LoadSurvexLineLeg(survexblock, stardata, sline, comment)
+ # print('Cave ' + str(survexfile.cave))
+ # print(survexblock)
+ LoadSurvexLineLeg(survexblock, stardata, sline, comment, survexfile.cave)
+ # print(' - From: ')
+ #print(stardata)
+ pass
elif stardata["type"] == "passage":
LoadSurvexLinePassage(survexblock, stardata, sline, comment)
+ # print(' - Passage: ')
#Missing "station" in stardata.
continue
-
+
# detect the star command
cmd, line = mstar.groups()
cmd = cmd.lower()
if re.match("include$(?i)", cmd):
includepath = os.path.join(os.path.split(survexfile.path)[0], re.sub(r"\.svx$", "", line))
- includesurvexfile = models.SurvexFile(path=includepath, cave=survexfile.cave)
+            print(' - Include file found, including - ' + includepath)
+ # Try to find the cave in the DB if not use the string as before
+ path_match = re.search(r"caves-(\d\d\d\d)/(\d+|\d\d\d\d-?\w+-\d+)/", includepath)
+ if path_match:
+ pos_cave = '%s-%s' % (path_match.group(1), path_match.group(2))
+ # print(pos_cave)
+ cave = models.getCaveByReference(pos_cave)
+ if cave:
+ survexfile.cave = cave
+ else:
+ print('No match for %s' % includepath)
+ includesurvexfile = models.SurvexFile(path=includepath)
includesurvexfile.save()
includesurvexfile.SetDirectory()
if includesurvexfile.exists():
+ survexblock.save()
fininclude = includesurvexfile.OpenFile()
RecursiveLoad(survexblock, includesurvexfile, fininclude, textlines)
-
+
elif re.match("begin$(?i)", cmd):
- if line:
+ if line:
+ newsvxpath = os.path.join(os.path.split(survexfile.path)[0], re.sub(r"\.svx$", "", line))
+ # Try to find the cave in the DB if not use the string as before
+ path_match = re.search(r"caves-(\d\d\d\d)/(\d+|\d\d\d\d-?\w+-\d+)/", newsvxpath)
+ if path_match:
+ pos_cave = '%s-%s' % (path_match.group(1), path_match.group(2))
+ print(pos_cave)
+ cave = models.getCaveByReference(pos_cave)
+ if cave:
+ survexfile.cave = cave
+ else:
+ print('No match for %s' % newsvxpath)
+
name = line.lower()
- survexblockdown = models.SurvexBlock(name=name, begin_char=fin.tell(), parent=survexblock, survexpath=survexblock.survexpath+"."+name, cave=survexblock.cave, survexfile=survexfile, totalleglength=0.0)
+ print(' - Begin found for: ' + name)
+ # print('Block cave: ' + str(survexfile.cave))
+ survexblockdown = models.SurvexBlock(name=name, begin_char=fin.tell(), parent=survexblock, survexpath=survexblock.survexpath+"."+name, cave=survexfile.cave, survexfile=survexfile, totalleglength=0.0)
survexblockdown.save()
+ survexblock.save()
+ survexblock = survexblockdown
+ # print(survexblockdown)
textlinesdown = [ ]
RecursiveLoad(survexblockdown, survexfile, fin, textlinesdown)
else:
iblankbegins += 1
-
+
elif re.match("end$(?i)", cmd):
if iblankbegins:
iblankbegins -= 1
else:
survexblock.text = "".join(textlines)
survexblock.save()
+ # print(' - End found: ')
+ endstamp = datetime.now()
+ timetaken = endstamp - stamp
+ # print(' - Time to process: ' + str(timetaken))
return
-
+
elif re.match("date$(?i)", cmd):
if len(line) == 10:
- survexblock.date = re.sub(r"\.", "-", line)
+ #print(' - Date found: ' + line)
+ survexblock.date = make_aware(datetime.strptime(re.sub(r"\.", "-", line), '%Y-%m-%d'), get_current_timezone())
expeditions = models.Expedition.objects.filter(year=line[:4])
if expeditions:
assert len(expeditions) == 1
survexblock.expedition = expeditions[0]
survexblock.expeditionday = survexblock.expedition.get_expedition_day(survexblock.date)
survexblock.save()
-
+
elif re.match("team$(?i)", cmd):
- mteammember = re.match(r"(Insts|Notes|Tape|Dog|Useless|Pics|Helper|Disto|Consultant)\s+(.*)$(?i)", line)
+ pass
+ # print(' - Team found: ')
+ mteammember = regex_team.match(line)
if mteammember:
- for tm in re.split(r" and | / |, | & | \+ |^both$|^none$(?i)", mteammember.group(2)):
+ for tm in regex_team_member.split(mteammember.group(2)):
if tm:
personexpedition = survexblock.expedition and GetPersonExpeditionNameLookup(survexblock.expedition).get(tm.lower())
if (personexpedition, tm) not in teammembers:
@@ -179,18 +304,23 @@ def RecursiveLoad(survexblock, survexfile, fin, textlines):
if personexpedition:
personrole.person=personexpedition.person
personrole.save()
-
+
elif cmd == "title":
- survextitle = models.SurvexTitle(survexblock=survexblock, title=line.strip('"'), cave=survexblock.cave)
+ #print(' - Title found: ')
+ survextitle = models.SurvexTitle(survexblock=survexblock, title=line.strip('"'), cave=survexfile.cave)
survextitle.save()
-
+ pass
+
elif cmd == "require":
# should we check survex version available for processing?
pass
elif cmd == "data":
+ #print(' - Data found: ')
ls = line.lower().split()
stardata = { "type":ls[0] }
+ #print(' - Star data: ', stardata)
+ #print(ls)
for i in range(0, len(ls)):
stardata[stardataparamconvert.get(ls[i], ls[i])] = i - 1
if ls[0] in ["normal", "cartesian", "nosurvey"]:
@@ -199,40 +329,23 @@ def RecursiveLoad(survexblock, survexfile, fin, textlines):
stardata = stardatadefault
else:
assert ls[0] == "passage", line
-
+
elif cmd == "equate":
+ #print(' - Equate found: ')
LoadSurvexEquate(survexblock, line)
elif cmd == "fix":
+ #print(' - Fix found: ')
survexblock.MakeSurvexStation(line.split()[0])
else:
+ #print(' - Stuff')
if cmd not in ["sd", "include", "units", "entrance", "data", "flags", "title", "export", "instrument",
"calibrate", "set", "infer", "alias", "ref", "cs", "declination", "case"]:
print("Unrecognised command in line:", cmd, line, survexblock, survexblock.survexfile.path)
-
-
-def ReloadSurvexCave(survex_cave, area):
- print(survex_cave, area)
- cave = models.Cave.objects.get(kataster_number=survex_cave, area__short_name=area)
- print(cave)
- #cave = models.Cave.objects.get(kataster_number=survex_cave)
- cave.survexblock_set.all().delete()
- cave.survexfile_set.all().delete()
- cave.survexdirectory_set.all().delete()
-
- survexfile = models.SurvexFile(path="caves-" + cave.kat_area() + "/" + survex_cave + "/" + survex_cave, cave=cave)
- survexfile.save()
- survexfile.SetDirectory()
-
- survexblockroot = models.SurvexBlock(name="root", survexpath="caves-" + cave.kat_area(), begin_char=0, cave=cave, survexfile=survexfile, totalleglength=0.0)
- survexblockroot.save()
- fin = survexfile.OpenFile()
- textlines = [ ]
- RecursiveLoad(survexblockroot, survexfile, fin, textlines)
- survexblockroot.text = "".join(textlines)
- survexblockroot.save()
-
+ endstamp = datetime.now()
+ timetaken = endstamp - stamp
+ # print(' - Time to process: ' + str(timetaken))
def LoadAllSurvexBlocks():
@@ -249,7 +362,7 @@ def LoadAllSurvexBlocks():
print(" - Data flushed")
- survexfile = models.SurvexFile(path="all", cave=None)
+ survexfile = models.SurvexFile(path=settings.SURVEX_TOPNAME, cave=None)
survexfile.save()
survexfile.SetDirectory()
@@ -258,22 +371,13 @@ def LoadAllSurvexBlocks():
survexblockroot.save()
fin = survexfile.OpenFile()
textlines = [ ]
+ # The real work starts here
RecursiveLoad(survexblockroot, survexfile, fin, textlines)
+ fin.close()
survexblockroot.text = "".join(textlines)
survexblockroot.save()
- #Load each cave,
- #FIXME this should be dealt with load all above
- print(" - Reloading all caves")
- caves = models.Cave.objects.all()
- for cave in caves:
- if cave.kataster_number and os.path.isdir(os.path.join(settings.SURVEX_DATA, "caves-" + cave.kat_area(), cave.kataster_number)):
- if cave.kataster_number not in ['40']:
- print("loading", cave, cave.kat_area())
- ReloadSurvexCave(cave.kataster_number, cave.kat_area())
-
-
poslineregex = re.compile(r"^\(\s*([+-]?\d*\.\d*),\s*([+-]?\d*\.\d*),\s*([+-]?\d*\.\d*)\s*\)\s*([^\s]+)$")
@@ -281,12 +385,12 @@ def LoadPos():
print('Loading Pos....')
- call([settings.CAVERN, "--output=%s/all.3d" % settings.SURVEX_DATA, "%s/all.svx" % settings.SURVEX_DATA])
- call([settings.THREEDTOPOS, '%sall.3d' % settings.SURVEX_DATA], cwd = settings.SURVEX_DATA)
- posfile = open("%sall.pos" % settings.SURVEX_DATA)
+ call([settings.CAVERN, "--output=%s%s.3d" % (settings.SURVEX_DATA, settings.SURVEX_TOPNAME), "%s%s.svx" % (settings.SURVEX_DATA, settings.SURVEX_TOPNAME)])
+ call([settings.THREEDTOPOS, '%s%s.3d' % (settings.SURVEX_DATA, settings.SURVEX_TOPNAME)], cwd = settings.SURVEX_DATA)
+ posfile = open("%s%s.pos" % (settings.SURVEX_DATA, settings.SURVEX_TOPNAME))
posfile.readline() #Drop header
for line in posfile.readlines():
- r = poslineregex.match(line)
+ r = poslineregex.match(line)
if r:
x, y, z, name = r.groups()
try:
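
The new QM handling in survex.py keys off regex_qm, applied to the comment text after regex_comment has already stripped the leading ";". Checking it against the example line given in the spec comment in the hunk above:

    import re

    regex_qm = re.compile(r'^\s*QM(\d)\s+?([a-dA-DxX])\s+([\w\-]+)\.(\d+)\s+(([\w\-]+)\.(\d+)|\-)\s+(.+)$')
    m = regex_qm.match("QM1 a hobnob_hallway_2.42 - junction of keyhole passage")
    # m.groups() == ('1', 'a', 'hobnob_hallway_2', '42', '-', None, None,
    #                'junction of keyhole passage')
    # groups 6 and 7 stay None for an unresolved QM, which is what the
    # resolve-section checks above rely on.
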
diff --git a/parsers/surveys.py b/parsers/surveys.py
index 72a05f3..efab536 100644
--- a/parsers/surveys.py
+++ b/parsers/surveys.py
@@ -99,7 +99,7 @@ def parseSurveyScans(expedition, logfile=None):
#scanList = listdir(expedition.year, surveyFolder)
scanList=os.listdir(os.path.join(yearPath,surveyFolder))
except AttributeError:
- print(surveyFolder + " ignored\r",)
+ print("Folder: " + surveyFolder + " ignored\r")
continue
for scan in scanList:
@@ -107,7 +107,7 @@ def parseSurveyScans(expedition, logfile=None):
scanChopped=re.match(r'(?i).*(notes|elev|plan|elevation|extend)(\d*)\.(png|jpg|jpeg)',scan).groups()
scanType,scanNumber,scanFormat=scanChopped
except AttributeError:
- print(scan + " ignored\r",)
+ print("File: " + scan + " ignored\r")
continue
if scanType == 'elev' or scanType == 'extend':
scanType = 'elevation'
@@ -174,9 +174,6 @@ def GetListDir(sdir):
ff = os.path.join(sdir, f)
res.append((f, ff, os.path.isdir(ff)))
return res
-
-
-
def LoadListScansFile(survexscansfolder):
diff --git a/parsers/surveysM.py b/parsers/surveysM.py
deleted file mode 100644
index 2b94b02..0000000
--- a/parsers/surveysM.py
+++ /dev/null
@@ -1,65 +0,0 @@
-from django.conf import settings
-import subprocess, re
-import troggle.core.models as models
-
-def load():
- print('Load survex files and relations')
- load_area('1623')
-
-def load_area(areacode):
-
- print('Searching all cave dirs files')
- basedir = settings.SURVEX_DATA+'caves-'+areacode+'/'
-
- cavedirs = bash("find "+basedir+" -maxdepth 1 -type d").splitlines() #this command finds all directories
- print('Obtained list of directories! (#dirs='+str(len(cavedirs))+')')
-
- for cavedir in cavedirs:
- if cavedir==basedir:
- continue #skip the basedir - a non-proper subdirectory
- parentname = bash('echo '+cavedir+' | rev | cut -f1 -d \'/\' | rev').splitlines()[0] #get final bit of the directory
- parentcave = models.CaveM.objects.filter(survex_file__icontains=cavedir)
- if len(parentcave)>1:
- print('Non unique parent - skipping. Name:'+parentname)
- elif len(parentcave)==0:
- print('Error! parent not created:'+parentname)
- continue
- else: #exaclty one match
- print('Adding relations of:'+parentname)
- parentcave = parentcave[0]
-
- surveyfiles = bash('find '+cavedir+' -name \'*.svx\'').splitlines()
- for fn in surveyfiles:
- print(fn)
- svxcontents = open(fn,'r').read().splitlines()
- try:
- dateline = [x for x in svxcontents if ('*date' in x)][0]
- date = re.findall('\\d\\d\\d\\d\\.\\d\\d\\.\\d\\d', dateline, re.S)[0]
-
-
- except:
- if( len( [x for x in svxcontents if ('*date' in x)] ) == 0 ):
- continue #skip dateless files
- print('Date format error in '+fn)
- print('Dateline = '+ '"'.join([x for x in svxcontents if ('*date' in x)]))
- date = '1900.01.01'
-
-
- newsurvex = models.SurveyM(survex_file=fn, date=date)
- newsurvex.save()
- parentcave.surveys.add(newsurvex)
- parentcave.save()
-
-
-def file_exists(filename):
- test = bash('if [ ! -f '+filename+' ] ; then echo MISSING; fi')#test for file exisence
- if 'MISSING' in test: #send error message to the database
- return False
- return True
-
-def bash(cmd): #calls command in bash shell, returns output
- process = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
- output, error = process.communicate()
- return output
-
-