diff options
Diffstat (limited to 'parsers')
-rw-r--r-- | parsers/cavesM.py | 129 | ||||
-rw-r--r-- | parsers/people.py | 33 |
2 files changed, 130 insertions, 32 deletions
import os
import re
import subprocess
import xml.etree.ElementTree as ET  # kept from the original file; not currently used

#
# This parser has to find several things:
# - files of .html format in the expoweb area - they contain some of the important information
# - a similar number of .svx files in the loser area - they contain all the measurements
#
# The previous version was very slow due to doing the searching in Python;
# this version delegates the heavy lifting (cavern / dump3d / find / grep)
# to the shell and only handles the more sophisticated bits in Python.
#
# NOTE(review): paths from settings are interpolated straight into shell
# commands. That is acceptable for trusted local survey data, but do not
# point this parser at untrusted file names.
#


def load():
    """Entry point: parse all caves of the 1623 area into the database."""
    print('Hi! I\'m caves parser. Ready to work')

    print('Loading caves of 1623 area')
    loadarea('caves-1623/')


def loadarea(areacode):
    """Load survex measurements and HTML descriptions for one area.

    *areacode* is a path fragment under ``settings.SURVEX_DATA``,
    e.g. ``'caves-1623/'``.
    """
    # Imported lazily so this module can be imported without a configured
    # Django environment (e.g. by tooling or tests).
    from django.conf import settings

    print('Searching all cave dirs files')
    basedir = settings.SURVEX_DATA + areacode

    # Process the survex master file once so the .3d file used by dump3d
    # below is up to date.
    bash('cavern -o' + settings.SURVEX_DATA + ' ' + settings.SURVEX_DATA + '1623-and-1626.svx')

    # Find all cave directories (one level below basedir).
    cavedirs = bash("find " + basedir + " -maxdepth 1 -type d").splitlines()
    print('Obtained list of directories! (#dirs=' + str(len(cavedirs)) + ')')

    _load_survex_caves(basedir, cavedirs)
    _load_descriptions()


def _load_survex_caves(basedir, cavedirs):
    """Create a CaveM record for every cave directory holding a valid .svx file."""
    import troggle.core.models as models
    from django.conf import settings

    for cavedir in cavedirs:
        if cavedir == basedir:
            continue  # skip the basedir - a non-proper subdirectory
        cavename = os.path.basename(cavedir)  # final component of the directory path

        fullname = cavedir + '/' + cavename + '.svx'
        if not os.path.isfile(fullname):  # send error message to the database
            msg = models.Parser_messageM(parsername='caves', content=cavedir + '/' + cavename + ' MISSING!', message_type='warn')
            print('Cave missing' + cavename + ' :(')
            msg.save()
            continue
        print('Found cave:' + cavename)

        cavernout = bash('cavern -q ' + fullname)  # make cavern process the thing
        if 'cavern: error:' in cavernout:
            msg = models.Parser_messageM(parsername='caves', content=cavedir + '/' + cavename + ' Survex file messed up!', message_type='warn')
            print('Fucked svx' + cavename + ' :(')
            msg.save()
            continue

        cavernout = cavernout.splitlines()
        # Pick the totals off cavern's summary lines; the unit suffixes
        # ("m." / "m") are sliced off before conversion. Assumes metric
        # cavern output — TODO confirm against the installed survex version.
        depth = float(([x for x in cavernout if ('Total vertical length' in x)][0].split()[-1])[:-2])
        length = float(([x for x in cavernout if ('Total length' in x)][0].split()[6])[:-1])
        surveyname = bash('cat ' + fullname + ' | grep \'\*begin\' | head -n1 | cut -f2 -d \' \' ').splitlines().pop()
        title = (bash('cat ' + fullname + ' | grep \'\*title\' | head -n1 | cut -f2 -d \' \' ').splitlines() or ["Not found"])[0]
        print((('depth', 'length', 'surv name'), (depth, length, surveyname)))
        print('dump3d ' + settings.SURVEX_DATA + '1623-and-1626.3d | grep NODE | grep \'\\[\\.' + surveyname + '.*\\]\'')
        # Stations belonging to this survey, extracted from the processed 3d file.
        nodes = bash('dump3d ' + settings.SURVEX_DATA + '1623-and-1626.3d | grep NODE | grep \'\\[.*\\.' + surveyname + '.*\\]\'').splitlines()
        print(nodes)

        newcave = models.CaveM(survex_file=fullname, total_length=length, name=title, total_depth=depth)
        newcave.save()
    # end of reading survex masterfiles


def _load_descriptions():
    """Attach HTML underground descriptions to the CaveM records."""
    import troggle.core.models as models
    from django.conf import settings

    print("Reading cave descriptions")
    cavefiles = bash('find ' + settings.CAVEDESCRIPTIONS + ' -name \'*.html\'').splitlines()
    for fn in cavefiles:
        print(fn)
        with open(fn, "r") as f:  # 'with' so the handle is closed even on errors
            contents = f.read()

        desc = extractXML(contents, 'underground_description')
        # The slug line looks like '<caveslug>...</caveslug>'; the '>...<'
        # match is trimmed by 6 leading characters (presumably '>' plus an
        # area prefix such as '1623-') and the trailing '<' — TODO confirm
        # the slug format against the description files.
        slugline = extractXML(contents, 'caveslug')
        slugmatch = re.search(r'>.*<', slugline) if slugline is not None else None
        name = slugmatch.group()[6:-1] if slugmatch is not None else None

        if desc is None or name is None:  # warn and carry on instead of crashing
            msg = models.Parser_messageM(parsername='caves', content=fn + ' Description messed up!', message_type='warn')
            print('Fucked description ' + fn + ' :(')
            msg.save()
            continue

        updatecave = models.CaveM.objects.filter(survex_file__icontains='/' + name + '.svx')
        if len(updatecave) > 1:
            print('Non unique solution - skipping. Name:' + name)
        elif len(updatecave) == 0:
            print('Cave with no survex data' + name)
            newcave = models.CaveM(description=desc, name=name)
            newcave.save()
        else:  # exactly one match
            updatecave = updatecave[0]
            updatecave.description = desc
            if updatecave.name == "Not found":
                updatecave.name = name
                updatecave.title = name
            updatecave.save()
    # end of reading cave descriptions


def extractXML(contents, tag):
    """Return the text of the first <tag>...</tag> element in *contents*.

    The opening and closing tags must each sit on a single line; the element
    may span several lines. Returns None when either tag is absent.
    """
    lines = contents.splitlines()
    beg = [x for x in lines if ('<' + tag + '>' in x)]
    end = [x for x in lines if ('</' + tag + '>' in x)]
    if (not beg) or (not end):
        return None
    begi = lines.index(beg[0])
    endi = lines.index(end[0])
    # Joining works for both the single-line and the multi-line case.
    # (The previous code returned only the FIRST CHARACTER of a multi-line
    # element because it indexed [0] into a joined string.)
    return '\n'.join(lines[begi:endi + 1])


def bash(cmd):
    """Run *cmd* through the shell and return its stdout as text.

    On Python 3, Popen with stdout=PIPE yields bytes; decode so callers can
    use string operations ('in', splitlines + concatenation) on the result.
    """
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    output, error = process.communicate()
    if isinstance(output, bytes):
        output = output.decode('utf-8', 'replace')
    return output
personline[header["Name"]] @@ -85,36 +83,7 @@ def LoadPersonsExpos(): nonLookupAttribs = {'nickname':nickname, 'is_guest':(personline[header["Guest"]] == "1")} save_carefully(models.PersonExpedition, lookupAttribs, nonLookupAttribs) - - # this fills in those people for whom 2008 was their first expo - #print "Loading personexpeditions 2008" - #for name in expomissing: - # firstname, lastname = name.split() - # is_guest = name in ["Eeva Makiranta", "Keith Curtis"] - # print "2008:", name - # persons = list(models.Person.objects.filter(first_name=firstname, last_name=lastname)) - # if not persons: - # person = models.Person(first_name=firstname, last_name = lastname, is_vfho = False, mug_shot = "") - # #person.Sethref() - # person.save() - # else: - # person = persons[0] - # expedition = models.Expedition.objects.get(year="2008") - # personexpedition = models.PersonExpedition(person=person, expedition=expedition, nickname="", is_guest=is_guest) - # personexpedition.save() - - #Notability is now a method of person. Makes no sense to store it in the database; it would need to be recalculated every time something changes. - AC 16 Feb 09 - # could rank according to surveying as well - #print "Setting person notability" - #for person in models.Person.objects.all(): - #person.notability = 0.0 - #for personexpedition in person.personexpedition_set.all(): - #if not personexpedition.is_guest: - #person.notability += 1.0 / (2012 - int(personexpedition.expedition.year)) - #person.bisnotable = person.notability > 0.3 # I don't know how to filter by this - #person.save() - - + # used in other referencing parser functions # expedition name lookup cached for speed (it's a very big list) Gpersonexpeditionnamelookup = { } |