Diffstat (limited to 'parsers')
-rw-r--r--   parsers/QMs.py       |  20
-rw-r--r--   parsers/caves.py     |  50
-rw-r--r--   parsers/cavesM.py    | 213
-rw-r--r--   parsers/logbooks.py  | 257
-rw-r--r--   parsers/people.py    |  81
-rw-r--r--   parsers/peopleM.py   |  27
-rw-r--r--   parsers/survex.py    | 262
-rw-r--r--   parsers/surveys.py   |   7
-rw-r--r--   parsers/surveysM.py  |  65
9 files changed, 420 insertions, 562 deletions
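The hunks below repeatedly trade the construct-then-.save() idiom for Django's update_or_create(), and record parser problems as DataIssue rows instead of only printing them. A minimal sketch of the combined pattern, assuming Django's standard manager API; the helper name import_slug is illustrative and not part of this changeset:

# A minimal sketch, assuming Django's standard manager API; the helper
# name import_slug is illustrative, not from this changeset. It combines
# the two idioms the hunks below adopt: update_or_create() in place of
# construct-then-.save(), and a DataIssue row (parser/message fields, as
# used in parsers/caves.py) wherever a problem used to be only print()ed.
import troggle.core.models as models

def import_slug(cave, slug, primary, context):
    try:
        # Returns an (object, created) tuple and is idempotent, so
        # re-running the import does not duplicate rows.
        cs, created = models.CaveSlug.objects.update_or_create(
            cave=cave, slug=slug, primary=primary)
        return cs
    except Exception:
        message = "Can't find text (slug): %s, skipping %s" % (slug, context)
        models.DataIssue.objects.create(parser='caves', message=message)
        print(message)

Since update_or_create() returns an (object, created) pair, the new call sites in readentrance() and readcave() unpack two values, hence the "e, state = ..." and "c, state = ..." assignments in the caves.py hunks.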
diff --git a/parsers/QMs.py b/parsers/QMs.py index efc8cd6..602b7af 100644 --- a/parsers/QMs.py +++ b/parsers/QMs.py @@ -17,19 +17,19 @@ def parseCaveQMs(cave,inputFile): try: steinBr=Cave.objects.get(official_name="Steinbrückenhöhle") except Cave.DoesNotExist: - print "Steinbruckenhoehle is not in the database. Please run parsers.cavetab first." + print("Steinbruckenhoehle is not in the database. Please run parsers.cavetab first.") return elif cave=='hauch': try: hauchHl=Cave.objects.get(official_name="Hauchhöhle") except Cave.DoesNotExist: - print "Hauchhoele is not in the database. Please run parsers.cavetab first." + print("Hauchhoele is not in the database. Please run parsers.cavetab first.") return elif cave =='kh': try: kh=Cave.objects.get(official_name="Kaninchenhöhle") except Cave.DoesNotExist: - print "KH is not in the database. Please run parsers.cavetab first." + print("KH is not in the database. Please run parsers.cavetab first.") parse_KH_QMs(kh, inputFile=inputFile) return @@ -48,7 +48,7 @@ def parseCaveQMs(cave,inputFile): elif cave=='hauch': placeholder, hadToCreate = LogbookEntry.objects.get_or_create(date__year=year, title="placeholder for QMs in 234", text="QMs temporarily attached to this should be re-attached to their actual trips", defaults={"date": date(year, 1, 1),"cave":hauchHl}) if hadToCreate: - print cave+" placeholder logbook entry for " + str(year) + " added to database" + print(cave + " placeholder logbook entry for " + str(year) + " added to database") QMnum=re.match(r".*?-\d*?-X?(?P<numb>\d*)",line[0]).group("numb") newQM = QM() newQM.found_by=placeholder @@ -71,19 +71,18 @@ def parseCaveQMs(cave,inputFile): if preexistingQM.new_since_parsing==False: #if the pre-existing QM has not been modified, overwrite it preexistingQM.delete() newQM.save() - print "overwriting " + str(preexistingQM) +"\r", - + print("overwriting " + str(preexistingQM) +"\r") else: # otherwise, print that it was ignored - print "preserving "+ str(preexistingQM) + ", which was edited in admin \r", + print("preserving " + str(preexistingQM) + ", which was edited in admin \r") except QM.DoesNotExist: #if there is no pre-existing QM, save the new one newQM.save() - print "QM "+str(newQM) + ' added to database\r', + print("QM "+str(newQM) + ' added to database\r') except KeyError: #check on this one continue except IndexError: - print "Index error in " + str(line) + print("Index error in " + str(line)) continue def parse_KH_QMs(kh, inputFile): @@ -104,7 +103,7 @@ def parse_KH_QMs(kh, inputFile): } nonLookupArgs={ 'grade':res['grade'], - 'nearest_station':res['nearest_station'], + 'nearest_station_name':res['nearest_station'], 'location_description':res['description'] } @@ -115,3 +114,4 @@ parseCaveQMs(cave='stein',inputFile=r"1623/204/qm.csv") parseCaveQMs(cave='hauch',inputFile=r"1623/234/qm.csv") parseCaveQMs(cave='kh', inputFile="1623/161/qmtodo.htm") #parseCaveQMs(cave='balkonhoehle',inputFile=r"1623/264/qm.csv") + diff --git a/parsers/caves.py b/parsers/caves.py index ba1c358..606007f 100644 --- a/parsers/caves.py +++ b/parsers/caves.py @@ -6,16 +6,18 @@ import re def readcaves(): - newArea = models.Area(short_name = "1623", parent = None) - newArea.save() - newArea = models.Area(short_name = "1626", parent = None) - newArea.save() - print("Reading Entrances") + + # Clear the cave data issues as we are reloading + models.DataIssue.objects.filter(parser='caves').delete() + + area_1623 = models.Area.objects.update_or_create(short_name = "1623", parent = None) + area_1626 = 
models.Area.objects.update_or_create(short_name = "1626", parent = None) + print(" - Reading Entrances") #print "list of <Slug> <Filename>" for filename in os.walk(settings.ENTRANCEDESCRIPTIONS).next()[2]: #Should be a better way of getting a list of files if filename.endswith('.html'): readentrance(filename) - print ("Reading Caves") + print (" - Reading Caves") for filename in os.walk(settings.CAVEDESCRIPTIONS).next()[2]: #Should be a better way of getting a list of files if filename.endswith('.html'): readcave(filename) @@ -51,7 +53,7 @@ def readentrance(filename): bearings = getXML(entrancecontents, "bearings", maxItems = 1, context = context) url = getXML(entrancecontents, "url", maxItems = 1, context = context) if len(non_public) == 1 and len(slugs) >= 1 and len(name) >= 1 and len(entrance_description) == 1 and len(explorers) == 1 and len(map_description) == 1 and len(location_description) == 1 and len(approach) == 1 and len(underground_description) == 1 and len(marking) == 1 and len(marking_comment) == 1 and len(findability) == 1 and len(findability_description) == 1 and len(alt) == 1 and len(northing) == 1 and len(easting) == 1 and len(tag_station) == 1 and len(exact_station) == 1 and len(other_station) == 1 and len(other_description) == 1 and len(bearings) == 1 and len(url) == 1: - e = models.Entrance(name = name[0], + e, state = models.Entrance.objects.update_or_create(name = name[0], non_public = {"True": True, "False": False, "true": True, "false": False,}[non_public[0]], entrance_description = entrance_description[0], explorers = explorers[0], @@ -75,14 +77,12 @@ def readentrance(filename): url = url[0], filename = filename, cached_primary_slug = slugs[0]) - e.save() primary = True for slug in slugs: #print slug, filename - cs = models.EntranceSlug(entrance = e, + cs = models.EntranceSlug.objects.update_or_create(entrance = e, slug = slug, primary = primary) - cs.save() primary = False def readcave(filename): @@ -117,7 +117,7 @@ def readcave(filename): url = getXML(cavecontents, "url", maxItems = 1, context = context) entrances = getXML(cavecontents, "entrance", context = context) if len(non_public) == 1 and len(slugs) >= 1 and len(official_name) == 1 and len(areas) >= 1 and len(kataster_code) == 1 and len(kataster_number) == 1 and len(unofficial_number) == 1 and len(explorers) == 1 and len(underground_description) == 1 and len(equipment) == 1 and len(references) == 1 and len(survey) == 1 and len(kataster_status) == 1 and len(underground_centre_line) == 1 and len(notes) == 1 and len(length) == 1 and len(depth) == 1 and len(extent) == 1 and len(survex_file) == 1 and len(description_file ) == 1 and len(url) == 1 and len(entrances) >= 1: - c = models.Cave(non_public = {"True": True, "False": False, "true": True, "false": False,}[non_public[0]], + c, state = models.Cave.objects.update_or_create(non_public = {"True": True, "False": False, "true": True, "false": False,}[non_public[0]], official_name = official_name[0], kataster_code = kataster_code[0], kataster_number = kataster_number[0], @@ -137,7 +137,6 @@ def readcave(filename): description_file = description_file[0], url = url[0], filename = filename) - c.save() for area_slug in areas: area = models.Area.objects.filter(short_name = area_slug) if area: @@ -149,12 +148,13 @@ def readcave(filename): primary = True for slug in slugs: try: - cs = models.CaveSlug(cave = c, + cs = models.CaveSlug.objects.update_or_create(cave = c, slug = slug, primary = primary) - cs.save() except: - print("Can't find text (slug): %s, skipping %s" % 
(slug, context)) + message = "Can't find text (slug): %s, skipping %s" % (slug, context) + models.DataIssue.objects.create(parser='caves', message=message) + print(message) primary = False for entrance in entrances: @@ -162,20 +162,26 @@ def readcave(filename): letter = getXML(entrance, "letter", maxItems = 1, context = context)[0] try: entrance = models.Entrance.objects.get(entranceslug__slug = slug) - ce = models.CaveAndEntrance(cave = c, entrance_letter = letter, entrance = entrance) - ce.save() + ce = models.CaveAndEntrance.objects.update_or_create(cave = c, entrance_letter = letter, entrance = entrance) except: - print ("Entrance text (slug) %s missing %s" % (slug, context)) + message = "Entrance text (slug) %s missing %s" % (slug, context) + models.DataIssue.objects.create(parser='caves', message=message) + print(message) def getXML(text, itemname, minItems = 1, maxItems = None, printwarnings = True, context = ""): items = re.findall("<%(itemname)s>(.*?)</%(itemname)s>" % {"itemname": itemname}, text, re.S) if len(items) < minItems and printwarnings: - print("%(count)i %(itemname)s found, at least %(min)i expected" % {"count": len(items), + message = "%(count)i %(itemname)s found, at least %(min)i expected" % {"count": len(items), "itemname": itemname, - "min": minItems} + context) + "min": minItems} + context + models.DataIssue.objects.create(parser='caves', message=message) + print(message) + if maxItems is not None and len(items) > maxItems and printwarnings: - print("%(count)i %(itemname)s found, no more than %(max)i expected" % {"count": len(items), + message = "%(count)i %(itemname)s found, no more than %(max)i expected" % {"count": len(items), "itemname": itemname, - "max": maxItems} + context) + "max": maxItems} + context + models.DataIssue.objects.create(parser='caves', message=message) + print(message) return items diff --git a/parsers/cavesM.py b/parsers/cavesM.py deleted file mode 100644 index f9900d6..0000000 --- a/parsers/cavesM.py +++ /dev/null @@ -1,213 +0,0 @@ - -import troggle.core.models as models #import models for various objects -from django.conf import settings -import xml.etree.ElementTree as ET #this is used to parse XML's -import subprocess -import re - -# -# This parser has to find several things: -# There are files of .html format in expoweb area - they contain some of the important information -# There is a similar number of .svx files in loser are - they contain all the measurements -# -# Previous version was incredibly slow due to various shitty ideas about finding things -# and overelayance on python when handling regular expressions, new version delegates heavy lifting to shell -# and handles more sophisticated bits only -# - -def load(): - print('Hi! I\'m caves parser. 
Ready to work') - - print('Loading caves of 1623 area') - loadarea('1623') - - print('Loading caves of 1626 area') - loadarea('1626') - - -def loadarea(areacode): - - if not file_exists(settings.SURVEX_DATA+'1623-and-1626.3d'): - print('Computing master .3d file') - bash('cavern -o'+settings.SURVEX_DATA+' '+settings.SURVEX_DATA+'1623-and-1626.svx') - else: - print('Loading from existing master .3d file') - - master3d = bash('dump3d -d '+settings.SURVEX_DATA+'1623-and-1626.3d').splitlines() - master3dN = [x for x in master3d if ('NODE' in x)] #list of nodes of master survex file - master3dL = [x for x in master3d if ('LINE' in x)] #list of nodes of master survex file - - print('Searching all cave dirs files') - basedir = settings.SURVEX_DATA+'caves-'+areacode+'/' - - cavedirs = bash("find "+basedir+" -maxdepth 1 -type d").splitlines() #this command finds all directories - print('Obtained list of directories! (#dirs='+str(len(cavedirs))+')') - ndirs = len(cavedirs) #remember number of dirs for nice debug output - - for cavedir in cavedirs: - if cavedir==basedir: - continue #skip the basedir - a non-proper subdirectory - cavename = bash('echo '+cavedir+' | rev | cut -f1 -d \'/\' | rev').splitlines()[0] #get final bit of the directory - - test = bash('if [ ! -f '+cavedir+'/'+cavename+'.svx ] ; then echo MISSING; fi')#test for file exisence - if not file_exists(cavedir+'/'+cavename+'.svx'): - msg = models.Parser_messageM(parsername='caves',content=cavedir+'/'+cavename+' MISSING!',message_type='warn') - print('Cave missing'+cavename+' :(') - msg.save() - continue - fullname=cavedir+'/'+cavename+'.svx' - print('Found cave:'+cavename) - cavernout = bash('cavern -o '+cavedir+' '+fullname) #make cavern process the thing - if 'cavern: error:' in cavernout: - msg = models.Parser_messageM(parsername='caves',content=cavedir+'/'+cavename+' Survex file messed up!',message_type='warn') - print('Fucked svx'+cavename+' :(') - msg.save() - continue - - cavernout = cavernout.splitlines() - depth = float(([x for x in cavernout if ('Total vertical length' in x)][0].split()[-1])[:-2]) - length = float(([x for x in cavernout if ('Total length' in x)][0].split()[6])[:-1]) - cavefile = open(fullname,'r') - cavefilecontents = cavefile.read().splitlines() - surveyname = [x for x in cavefilecontents if ('*begin ') in x][0].split()[1].lower() - try: - title = [x for x in cavefilecontents if ('*title ') in x][0].split()[1] - except: - syrveyname = "Untitled" - - relevant_nodes = [x for x in master3dN if (('['+areacode+'.'+surveyname+'.' in x) or ('['+areacode+'.'+surveyname+']' in x))] - entrance_nodes = [x for x in relevant_nodes if 'ENTRANCE' in x] - surface_nodes = [x for x in relevant_nodes if 'SURFACE' in x] - location_nodes = [] - print('rel_nodes'+str(len(relevant_nodes))) - if len(entrance_nodes) > 0: - location_nodes = entrance_nodes - elif len(surface_nodes) > 0: - location_nodes = surface_nodes - elif len(relevant_nodes) > 0: - location_nodes = relevant_nodes - - try: - location = sorted(location_nodes, key = lambda y : float(y.split()[3])).pop() - except: - print(location_nodes) - location = 'Not found' - - relevant_lines = [x for x in master3dL if (('['+areacode+'.'+surveyname+'.' 
in x) or ('['+areacode+'.'+surveyname+']' in x))] - try: - lastleg = sorted(relevant_lines, key = lambda y : y.split().pop()).pop() - except: - lastleg = ['LINE 1900.01.01'] - try: - lastdate = lastleg.split().pop() - if 'STYLE' in lastdate: - lastdate = lastleg.split().pop().pop() - except: - lastdate = '1900.01.01' - - entrance = ' '.join(location.split()[1:3]) - print((('depth','length','surv name','entr','date'),(depth,length,surveyname,entrance,lastdate))) #sanity check print - - - newcave = models.CaveM( - survex_file = fullname, - total_length = length, - name=areacode+'.'+surveyname, - total_depth = depth, - date = lastdate, - entrance = entrance) - newcave.save() - #end of reading survex masterfiles - - print ("Reading cave descriptions") - cavefiles = bash('find '+settings.CAVEDESCRIPTIONS+' -name \'*.html\'').splitlines() - for fn in cavefiles: - f = open(fn, "r") - print(fn) - contents = f.read() - - slug = re.sub(r"\s+", "", extractXML(contents,'caveslug')) - desc = extractXML(contents,'underground_description') - name = slug[5:] #get survex compatible name - area = slug[0:4] - - print([area,name]) - - if desc==None or name==None: - msg = models.Parser_messageM(parsername='caves',content=fn+' Description meesed up!',message_type='warn') - print('Fucked description '+fn+' :(') - msg.save() - continue - - print(area+'/'+name+'/'+name+'.svx') - - updatecave = models.CaveM.objects.filter(survex_file__icontains=area+'/'+name+'/'+name+'.svx') - if len(updatecave)>1: - print('Non unique solution - skipping. Name:'+name) - elif len(updatecave)==0: - print('Cave with no survex data:'+name) - continue - else: #exaclty one match - print('Adding desc:'+name) - updatecave = updatecave[0] - updatecave.description = '/cave/descriptionM/'+slug #area-name - updatecave.title=name - updatecave.save() - - slugS = slug - explorersS = extractXML(contents,'explorers') - underground_descriptionS = extractXML(contents,'underground_description') - equipmentS = extractXML(contents,'equipment') - referencesS = extractXML(contents,'references') - surveyS = extractXML(contents,'survey') - kataster_statusS = extractXML(contents,'kataster_status') - underground_centre_lineS = extractXML(contents,'underground_centre_line') - survex_fileS = extractXML(contents,'survex_file') - notesS = extractXML(contents,'notes') - - - newcavedesc = models.Cave_descriptionM( - slug = slugS, - explorers = explorersS, - underground_description = underground_descriptionS, - equipment = equipmentS, - references = referencesS, - survey = surveyS, - kataster_status = kataster_statusS, - underground_centre_line = underground_centre_lineS, - survex_file = survex_fileS, - notes = notesS) - newcavedesc.save() - - - - - #end of reading cave descriptions - -def file_exists(filename): - test = bash('if [ ! 
-f '+filename+' ] ; then echo MISSING; fi')#test for file exisence - if 'MISSING' in test: #send error message to the database - return False - return True - -def extractXML(contents,tag): - #find correct lines - lines = contents.splitlines() - beg = [x for x in lines if ('<'+tag+'>' in x)] - end = [x for x in lines if ('</'+tag+'>' in x)] - if (not beg) or (not end): - return None - begi = lines.index(beg[0]) - endi = lines.index(end[0]) - if endi!=begi: - segment = '\n'.join(lines[begi:endi+1]) - else: - segment = lines[begi:endi+1][0] - - hit = re.findall('<'+tag+'>(.*)</'+tag+'>', segment, re.S)[0] - return hit - -def bash(cmd): #calls command in bash shell, returns output - process = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE) - output, error = process.communicate() - return output diff --git a/parsers/logbooks.py b/parsers/logbooks.py index cb40f58..9dfa31b 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -7,6 +7,8 @@ from parsers.people import GetPersonExpeditionNameLookup from parsers.cavetab import GetCaveLookup from django.template.defaultfilters import slugify +from django.utils.timezone import get_current_timezone +from django.utils.timezone import make_aware import csv import re @@ -23,19 +25,23 @@ from utils import save_carefully # # the logbook loading section # -def GetTripPersons(trippeople, expedition, logtime_underground): +def GetTripPersons(trippeople, expedition, logtime_underground): res = [ ] author = None - for tripperson in re.split(",|\+|&|&(?!\w+;)| and ", trippeople): + round_bracket_regex = re.compile(r"[\(\[].*?[\)\]]") + for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople): tripperson = tripperson.strip() - mul = re.match("<u>(.*?)</u>$(?i)", tripperson) + mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson) if mul: tripperson = mul.group(1).strip() if tripperson and tripperson[0] != '*': #assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap) + tripperson = re.sub(round_bracket_regex, "", tripperson).strip() personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower()) if not personyear: - print "NoMatchFor: '%s'" % tripperson + print(" - No name match for: '%s'" % tripperson) + message = "No name match for: '%s' in year '%s'" % (tripperson, expedition.year) + models.DataIssue.objects.create(parser='logbooks', message=message) res.append((personyear, logtime_underground)) if mul: author = personyear @@ -45,7 +51,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground): author = res[-1][0] return res, author -def GetTripCave(place): #need to be fuzzier about matching here. Already a very slow function... +def GetTripCave(place): #need to be fuzzier about matching here. Already a very slow function... # print "Getting cave for " , place try: katastNumRes=[] @@ -65,32 +71,34 @@ def GetTripCave(place): #need to be fuzzier about matching h return tripCaveRes elif len(tripCaveRes)>1: - print "Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes) + print("Ambiguous place " + str(place) + " entered. 
Choose from " + str(tripCaveRes)) correctIndex=input("type list index of correct cave") return tripCaveRes[correctIndex] else: - print "No cave found for place " , place + print("No cave found for place " , place) return noncaveplaces = [ "Journey", "Loser Plateau" ] -def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground): +def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"): """ saves a logbook entry and related persontrips """ trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground) if not author: - print "skipping logentry", title + print(" - Skipping logentry: " + title + " - no author for entry") + message = "Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year) + models.DataIssue.objects.create(parser='logbooks', message=message) return - -# tripCave = GetTripCave(place) - # + + #tripCave = GetTripCave(place) + lplace = place.lower() if lplace not in noncaveplaces: cave=GetCaveLookup().get(lplace) #Check for an existing copy of the current entry, and save expeditionday = expedition.get_expedition_day(date) - lookupAttribs={'date':date, 'title':title} - nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50]} + lookupAttribs={'date':date, 'title':title} + nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50], 'entry_type':entry_type} lbo, created=save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs) for tripperson, time_underground in trippersons: @@ -102,8 +110,8 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_ def ParseDate(tripdate, year): """ Interprets dates in the expo logbooks and returns a correct datetime.date object """ - mdatestandard = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate) - mdategoof = re.match("(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate) + mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate) + mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate) if mdatestandard: assert mdatestandard.group(1) == year, (tripdate, year) year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3)) @@ -115,9 +123,9 @@ def ParseDate(tripdate, year): assert False, tripdate return datetime.date(year, month, day) -# 2007, 2008, 2006 +# 2006, 2008 - 2010 def Parselogwikitxt(year, expedition, txt): - trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt) + trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt) for triphead, triptext in trippara: tripheadp = triphead.split("|") #print "ttt", tripheadp @@ -126,7 +134,7 @@ def Parselogwikitxt(year, expedition, txt): tripsplace = tripplace.split(" - ") tripcave = tripsplace[0].strip() - tul = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext) + tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext) if tul: #assert len(tul) <= 1, (triphead, triptext) #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext) @@ -140,12 +148,16 @@ def Parselogwikitxt(year, expedition, txt): #print "\n", tripcave, "--- ppp", trippeople, len(triptext) EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) -# 2002, 2004, 2005 +# 2002, 2004, 2005, 2007, 2011 - 2018 def Parseloghtmltxt(year, expedition, txt): - tripparas = 
re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt) + #print(" - Starting log html parser") + tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt) + logbook_entry_count = 0 for trippara in tripparas: + #print(" - HR detected - maybe a trip?") + logbook_entry_count += 1 - s = re.match('''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date + s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)? \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)? \s*<div\s+class="trippeople">\s*(.*?)</div> @@ -155,38 +167,41 @@ def Parseloghtmltxt(year, expedition, txt): \s*$ ''', trippara) if not s: - if not re.search("Rigging Guide", trippara): - print "can't parse: ", trippara # this is 2007 which needs editing + if not re.search(r"Rigging Guide", trippara): + print("can't parse: ", trippara) # this is 2007 which needs editing #assert s, trippara continue - tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups() ldate = ParseDate(tripdate.strip(), year) #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate) - trippeople = re.sub("Ol(?!l)", "Olly", trippeople) - trippeople = re.sub("Wook(?!e)", "Wookey", trippeople) + #trippeople = re.sub(r"Ol(?!l)", "Olly", trippeople) + #trippeople = re.sub(r"Wook(?!e)", "Wookey", trippeople) triptitles = triptitle.split(" - ") if len(triptitles) >= 2: tripcave = triptitles[0] else: tripcave = "UNKNOWN" - #print "\n", tripcave, "--- ppp", trippeople, len(triptext) - ltriptext = re.sub("</p>", "", triptext) - ltriptext = re.sub("\s*?\n\s*", " ", ltriptext) - ltriptext = re.sub("<p>", "\n\n", ltriptext).strip() - EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) - - -# main parser for pre-2001. simpler because the data has been hacked so much to fit it + #print("\n", tripcave, "--- ppp", trippeople, len(triptext)) + ltriptext = re.sub(r"</p>", "", triptext) + ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip() + EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, + trippeople=trippeople, expedition=expedition, logtime_underground=0, + entry_type="html") + if logbook_entry_count == 0: + print(" - No trip entrys found in logbook, check the syntax matches htmltxt format") + + +# main parser for 1991 - 2001. 
simpler because the data has been hacked so much to fit it def Parseloghtml01(year, expedition, txt): - tripparas = re.findall("<hr[\s/]*>([\s\S]*?)(?=<hr)", txt) + tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt) for trippara in tripparas: s = re.match(u"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara) assert s, trippara[:300] tripheader, triptext = s.group(1), s.group(2) - mtripid = re.search('<a id="(.*?)"', tripheader) + mtripid = re.search(r'<a id="(.*?)"', tripheader) tripid = mtripid and mtripid.group(1) or "" - tripheader = re.sub("</?(?:[ab]|span)[^>]*>", "", tripheader) + tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader) #print " ", [tripheader] #continue @@ -194,7 +209,7 @@ def Parseloghtml01(year, expedition, txt): tripdate, triptitle, trippeople = tripheader.split("|") ldate = ParseDate(tripdate.strip(), year) - mtu = re.search('<p[^>]*>(T/?U.*)', triptext) + mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext) if mtu: tu = mtu.group(1) triptext = triptext[:mtu.start(0)] + triptext[mtu.end():] @@ -206,38 +221,40 @@ def Parseloghtml01(year, expedition, txt): ltriptext = triptext - mtail = re.search('(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext) + mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext) if mtail: #print mtail.group(0) ltriptext = ltriptext[:mtail.start(0)] - ltriptext = re.sub("</p>", "", ltriptext) - ltriptext = re.sub("\s*?\n\s*", " ", ltriptext) - ltriptext = re.sub("<p>|<br>", "\n\n", ltriptext).strip() + ltriptext = re.sub(r"</p>", "", ltriptext) + ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip() #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext) - ltriptext = re.sub("</?u>", "_", ltriptext) - ltriptext = re.sub("</?i>", "''", ltriptext) - ltriptext = re.sub("</?b>", "'''", ltriptext) + ltriptext = re.sub(r"</?u>", "_", ltriptext) + ltriptext = re.sub(r"</?i>", "''", ltriptext) + ltriptext = re.sub(r"</?b>", "'''", ltriptext) #print ldate, trippeople.strip() # could includ the tripid (url link for cross referencing) - EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) - + EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, + trippeople=trippeople, expedition=expedition, logtime_underground=0, + entry_type="html") +# parser for 2003 def Parseloghtml03(year, expedition, txt): - tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt) + tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt) for trippara in tripparas: s = re.match(u"(?s)\s*<p>(.*?)</p>(.*)$", trippara) assert s, trippara tripheader, triptext = s.group(1), s.group(2) - tripheader = re.sub(" ", " ", tripheader) - tripheader = re.sub("\s+", " ", tripheader).strip() + tripheader = re.sub(r" ", " ", tripheader) + tripheader = re.sub(r"\s+", " ", tripheader).strip() sheader = tripheader.split(" -- ") tu = "" if re.match("T/U|Time underwater", sheader[-1]): tu = sheader.pop() if len(sheader) != 3: - print "header not three pieces", sheader + print("header not three pieces", sheader) tripdate, triptitle, trippeople = sheader ldate = ParseDate(tripdate.strip(), year) triptitles = triptitle.split(" , ") @@ -246,37 +263,14 @@ def Parseloghtml03(year, expedition, txt): else: tripcave = "UNKNOWN" #print tripcave, "--- ppp", triptitle, trippeople, len(triptext) - ltriptext = re.sub("</p>", "", triptext) - 
ltriptext = re.sub("\s*?\n\s*", " ", ltriptext) - ltriptext = re.sub("<p>", "\n\n", ltriptext).strip() - ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext) - EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) - -yearlinks = [ -# ("2013", "2013/logbook.html", Parseloghtmltxt), - ("2012", "2012/logbook.html", Parseloghtmltxt), - ("2011", "2011/logbook.html", Parseloghtmltxt), - ("2010", "2010/logbook.html", Parselogwikitxt), - ("2009", "2009/2009logbook.txt", Parselogwikitxt), - ("2008", "2008/2008logbook.txt", Parselogwikitxt), - ("2007", "2007/logbook.html", Parseloghtmltxt), - ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt), - ("2005", "2005/logbook.html", Parseloghtmltxt), - ("2004", "2004/logbook.html", Parseloghtmltxt), - ("2003", "2003/logbook.html", Parseloghtml03), - ("2002", "2002/logbook.html", Parseloghtmltxt), - ("2001", "2001/log.htm", Parseloghtml01), - ("2000", "2000/log.htm", Parseloghtml01), - ("1999", "1999/log.htm", Parseloghtml01), - ("1998", "1998/log.htm", Parseloghtml01), - ("1997", "1997/log.htm", Parseloghtml01), - ("1996", "1996/log.htm", Parseloghtml01), - ("1995", "1995/log.htm", Parseloghtml01), - ("1994", "1994/log.htm", Parseloghtml01), - ("1993", "1993/log.htm", Parseloghtml01), - ("1992", "1992/log.htm", Parseloghtml01), - ("1991", "1991/log.htm", Parseloghtml01), - ] + ltriptext = re.sub(r"</p>", "", triptext) + ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip() + ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext) + EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, + text = ltriptext, trippeople=trippeople, expedition=expedition, + logtime_underground=0, entry_type="html") + def SetDatesFromLogbookEntries(expedition): """ @@ -295,54 +289,67 @@ def SetDatesFromLogbookEntries(expedition): persontrip.persontrip_next = None lprevpersontrip = persontrip persontrip.save() - - - + + def LoadLogbookForExpedition(expedition): """ Parses all logbook entries for one expedition """ - expowebbase = os.path.join(settings.EXPOWEB, "years") - year = str(expedition.year) - for lyear, lloc, parsefunc in yearlinks: - if lyear == year: - break - fin = open(os.path.join(expowebbase, lloc)) - print "opennning", lloc - txt = fin.read().decode("latin1") - fin.close() - parsefunc(year, expedition, txt) - SetDatesFromLogbookEntries(expedition) - return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count()) + expowebbase = os.path.join(settings.EXPOWEB, "years") + yearlinks = settings.LOGBOOK_PARSER_SETTINGS + + logbook_parseable = False + + if expedition.year in yearlinks: + year_settings = yearlinks[expedition.year] + file_in = open(os.path.join(expowebbase, year_settings[0])) + txt = file_in.read().decode("latin1") + file_in.close() + parsefunc = year_settings[1] + logbook_parseable = True + print(" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1]) + else: + try: + file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE)) + txt = file_in.read().decode("latin1") + file_in.close() + logbook_parseable = True + print("No set parser found using default") + parsefunc = settings.DEFAULT_LOGBOOK_PARSER + except (IOError): + 
logbook_parseable = False + print("Couldn't open default logbook file and nothing in settings for expo " + expedition.year) + + if logbook_parseable: + parser = globals()[parsefunc] + parser(expedition.year, expedition, txt) + SetDatesFromLogbookEntries(expedition) + + #return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count()) def LoadLogbooks(): - """ This is the master function for parsing all logbooks into the Troggle database. Requires yearlinks, which is a list of tuples for each expedition with expedition year, logbook path, and parsing function. """ - - #Deletion has been moved to a seperate function to enable the non-destructive importing - #models.LogbookEntry.objects.all().delete() - expowebbase = os.path.join(settings.EXPOWEB, "years") - #yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite - #yearlinks = [ ("1996", "1996/log.htm", Parseloghtml01),] # overwrite - - for year, lloc, parsefunc in yearlinks: - # This will not work until the corresponding year exists in the database. - # In 2012 this needed noscript/folk.csv to be updated first. - expedition = models.Expedition.objects.filter(year = year)[0] - fin = open(os.path.join(expowebbase, lloc)) - txt = fin.read().decode("latin1") - fin.close() - parsefunc(year, expedition, txt) - SetDatesFromLogbookEntries(expedition) + """ This is the master function for parsing all logbooks into the Troggle database. """ + + # Clear the logbook data issues as we are reloading + models.DataIssue.objects.filter(parser='logbooks').delete() + # Fetch all expos + expos = models.Expedition.objects.all() + for expo in expos: + print("\nLoading Logbook for: " + expo.year) + + # Load logbook for expo + LoadLogbookForExpedition(expo) + -dateRegex = re.compile('<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S) -expeditionYearRegex = re.compile('<span\s+class="expeditionyear">(.*?)</span>', re.S) -titleRegex = re.compile('<H1>(.*?)</H1>', re.S) -reportRegex = re.compile('<div\s+class="report">(.*)</div>\s*</body>', re.S) -personRegex = re.compile('<div\s+class="person">(.*?)</div>', re.S) -nameAuthorRegex = re.compile('<span\s+class="name(,author|)">(.*?)</span>', re.S) -TURegex = re.compile('<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S) -locationRegex = re.compile('<span\s+class="location">(.*?)</span>', re.S) -caveRegex = re.compile('<span\s+class="cave">(.*?)</span>', re.S) +dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S) +expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S) +titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S) +reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S) +personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S) +nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S) +TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S) +locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S) +caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S) def parseAutoLogBookEntry(filename): errors = [] @@ -435,4 +442,4 @@ def parseAutoLogBookEntry(filename): time_underground = TU, logbook_entry = logbookEntry, is_logbook_entry_author = author).save() - print logbookEntry + print(logbookEntry) diff --git a/parsers/people.py b/parsers/people.py index 4dba3a8..34a5ff3 100644 --- 
a/parsers/people.py +++ b/parsers/people.py @@ -4,6 +4,8 @@ from django.conf import settings import troggle.core.models as models import csv, re, datetime, os, shutil from utils import save_carefully +from HTMLParser import HTMLParser +from unidecode import unidecode def saveMugShot(mugShotPath, mugShotFilename, person): if mugShotFilename.startswith(r'i/'): #if filename in cell has the directory attached (I think they all do), remove it @@ -44,13 +46,13 @@ def parseMugShotAndBlurb(personline, header, person): def LoadPersonsExpos(): - persontab = open(os.path.join(settings.EXPOWEB, "noinfo", "folk.csv")) + persontab = open(os.path.join(settings.EXPOWEB, "folk", "folk.csv")) personreader = csv.reader(persontab) headers = personreader.next() header = dict(zip(headers, range(len(headers)))) # make expeditions - print "Loading expeditions" + print("Loading expeditions") years = headers[5:] for year in years: @@ -59,20 +61,35 @@ def LoadPersonsExpos(): save_carefully(models.Expedition, lookupAttribs, nonLookupAttribs) - # make persons - print "Loading personexpeditions" + print("Loading personexpeditions") for personline in personreader: name = personline[header["Name"]] - name = re.sub("<.*?>", "", name) - mname = re.match("(\w+)(?:\s((?:van |ten )?\w+))?(?:\s\(([^)]*)\))?", name) - nickname = mname.group(3) or "" - - lookupAttribs={'first_name':mname.group(1), 'last_name':(mname.group(2) or "")} - nonLookupAttribs={'is_vfho':personline[header["VfHO member"]],} + name = re.sub(r"<.*?>", "", name) + + firstname = "" + nickname = "" + + rawlastname = personline[header["Lastname"]].strip() + matchlastname = re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?", rawlastname) + lastname = matchlastname.group(1).strip() + + splitnick = re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?", name) + fullname = splitnick.group(1) + + nickname = splitnick.group(2) or "" + + fullname = fullname.strip() + names = fullname.split(' ') + firstname = names[0] + if len(names) == 1: + lastname = "" + + lookupAttribs={'first_name':firstname, 'last_name':(lastname or "")} + nonLookupAttribs={'is_vfho':personline[header["VfHO member"]], 'fullname':fullname} person, created = save_carefully(models.Person, lookupAttribs, nonLookupAttribs) - + parseMugShotAndBlurb(personline=personline, header=header, person=person) # make person expedition from table @@ -83,7 +100,26 @@ def LoadPersonsExpos(): nonLookupAttribs = {'nickname':nickname, 'is_guest':(personline[header["Guest"]] == "1")} save_carefully(models.PersonExpedition, lookupAttribs, nonLookupAttribs) - + + # this fills in those people for whom 2008 was their first expo + #print "Loading personexpeditions 2008" + #expoers2008 = """Edvin Deadman,Kathryn Hopkins,Djuke Veldhuis,Becka Lawson,Julian Todd,Natalie Uomini,Aaron Curtis,Tony Rooke,Ollie Stevens,Frank Tully,Martin Jahnke,Mark Shinwell,Jess Stirrups,Nial Peters,Serena Povia,Olly Madge,Steve Jones,Pete Harley,Eeva Makiranta,Keith Curtis""".split(",") + #expomissing = set(expoers2008) + #for name in expomissing: + # firstname, lastname = name.split() + # is_guest = name in ["Eeva Makiranta", "Keith Curtis"] + # print "2008:", name + # persons = list(models.Person.objects.filter(first_name=firstname, last_name=lastname)) + # if not persons: + # person = models.Person(first_name=firstname, last_name = lastname, is_vfho = False, mug_shot = "") + # #person.Sethref() + # person.save() + # else: + # person = persons[0] + # expedition = models.Expedition.objects.get(year="2008") + # personexpedition = models.PersonExpedition(person=person, 
expedition=expedition, nickname="", is_guest=is_guest) + # personexpedition.save() + # used in other referencing parser functions # expedition name lookup cached for speed (it's a very big list) Gpersonexpeditionnamelookup = { } @@ -96,20 +132,33 @@ def GetPersonExpeditionNameLookup(expedition): res = { } duplicates = set() - print "Calculating GetPersonExpeditionNameLookup for", expedition.year + print("Calculating GetPersonExpeditionNameLookup for " + expedition.year) personexpeditions = models.PersonExpedition.objects.filter(expedition=expedition) + htmlparser = HTMLParser() for personexpedition in personexpeditions: possnames = [ ] - f = personexpedition.person.first_name.lower() - l = personexpedition.person.last_name.lower() + f = unidecode(htmlparser.unescape(personexpedition.person.first_name.lower())) + l = unidecode(htmlparser.unescape(personexpedition.person.last_name.lower())) + full = unidecode(htmlparser.unescape(personexpedition.person.fullname.lower())) if l: possnames.append(f + " " + l) possnames.append(f + " " + l[0]) possnames.append(f + l[0]) possnames.append(f[0] + " " + l) possnames.append(f) - if personexpedition.nickname: + if full not in possnames: + possnames.append(full) + if personexpedition.nickname not in possnames: possnames.append(personexpedition.nickname.lower()) + if l: + # This allows for nickname to be used for short name eg Phil + # adding Phil Sargent to the list + if str(personexpedition.nickname.lower() + " " + l) not in possnames: + possnames.append(personexpedition.nickname.lower() + " " + l) + if str(personexpedition.nickname.lower() + " " + l[0]) not in possnames: + possnames.append(personexpedition.nickname.lower() + " " + l[0]) + if str(personexpedition.nickname.lower() + l[0]) not in possnames: + possnames.append(personexpedition.nickname.lower() + l[0]) for possname in possnames: if possname in res: diff --git a/parsers/peopleM.py b/parsers/peopleM.py deleted file mode 100644 index 62c7ce0..0000000 --- a/parsers/peopleM.py +++ /dev/null @@ -1,27 +0,0 @@ -from django.conf import settings -import troggle.core.models as models - -def load(): - folkfile = open(settings.EXPOWEB+"noinfo/folk.csv") - personlines = folkfile.read().splitlines() - persontable = [x.split(',') for x in personlines] - years = [persontable[0][i] for i in range(5,len(persontable[0]))] - for year in years: - newexpedition = models.ExpeditionM( date = year ) - newexpedition.save() - for row in persontable[1:]: #skip header - attendedid = [i for i, x in enumerate(row) if '1' in x] - attendedyears = [persontable[0][i] for i in attendedid if i >= 5] - name = row[0] - print(name+' has attended: '+', '.join(attendedyears)) - newperson = models.PersonM( - name = name) - newperson.save() - for year in attendedyears: - target = models.ExpeditionM.objects.get(date=year) - newperson.expos_attended.add( target ) - print('Person -> Expo table created!') - - - - diff --git a/parsers/survex.py b/parsers/survex.py index 536314f..14bd035 100644 --- a/parsers/survex.py +++ b/parsers/survex.py @@ -5,20 +5,26 @@ import troggle.settings as settings from subprocess import call, Popen, PIPE from troggle.parsers.people import GetPersonExpeditionNameLookup +from django.utils.timezone import get_current_timezone +from django.utils.timezone import make_aware + import re import os +from datetime import datetime +line_leg_regex = re.compile(r"[\d\-+.]+$") -def LoadSurvexLineLeg(survexblock, stardata, sline, comment): +def LoadSurvexLineLeg(survexblock, stardata, sline, comment, cave): + # The try 
catches here need replacing as they are relativly expensive ls = sline.lower().split() ssfrom = survexblock.MakeSurvexStation(ls[stardata["from"]]) ssto = survexblock.MakeSurvexStation(ls[stardata["to"]]) - + survexleg = models.SurvexLeg(block=survexblock, stationfrom=ssfrom, stationto=ssto) if stardata["type"] == "normal": try: survexleg.tape = float(ls[stardata["tape"]]) - except ValueError: + except ValueError: print("Tape misread in", survexblock.survexfile.path) print("Stardata:", stardata) print("Line:", ls) @@ -53,14 +59,17 @@ def LoadSurvexLineLeg(survexblock, stardata, sline, comment): survexleg.compass = 1000 survexleg.clino = -90.0 else: - assert re.match(r"[\d\-+.]+$", lcompass), ls - assert re.match(r"[\d\-+.]+$", lclino) and lclino != "-", ls + assert line_leg_regex.match(lcompass), ls + assert line_leg_regex.match(lclino) and lclino != "-", ls survexleg.compass = float(lcompass) survexleg.clino = float(lclino) - + + if cave: + survexleg.cave = cave + # only save proper legs survexleg.save() - + itape = stardata.get("tape") if itape: try: @@ -80,96 +89,212 @@ def LoadSurvexEquate(survexblock, sline): def LoadSurvexLinePassage(survexblock, stardata, sline, comment): pass - stardatadefault = {"type":"normal", "t":"leg", "from":0, "to":1, "tape":2, "compass":3, "clino":4} stardataparamconvert = {"length":"tape", "bearing":"compass", "gradient":"clino"} +regex_comment = re.compile(r"([^;]*?)\s*(?:;\s*(.*))?\n?$") +regex_ref = re.compile(r'.*?ref.*?(\d+)\s*#\s*(\d+)') +regex_star = re.compile(r'\s*\*[\s,]*(\w+)\s*(.*?)\s*(?:;.*)?$') +regex_team = re.compile(r"(Insts|Notes|Tape|Dog|Useless|Pics|Helper|Disto|Consultant)\s+(.*)$(?i)") +regex_team_member = re.compile(r" and | / |, | & | \+ |^both$|^none$(?i)") +regex_qm = re.compile(r'^\s*QM(\d)\s+?([a-dA-DxX])\s+([\w\-]+)\.(\d+)\s+(([\w\-]+)\.(\d+)|\-)\s+(.+)$') + def RecursiveLoad(survexblock, survexfile, fin, textlines): iblankbegins = 0 text = [ ] stardata = stardatadefault teammembers = [ ] - -# uncomment to print out all files during parsing - print("Reading file:", survexblock.survexfile.path) - while True: - svxline = fin.readline().decode("latin1") - if not svxline: - return - textlines.append(svxline) - + + # uncomment to print out all files during parsing + print(" - Reading file: " + survexblock.survexfile.path) + stamp = datetime.now() + lineno = 0 + + # Try to find the cave in the DB if not use the string as before + path_match = re.search(r"caves-(\d\d\d\d)/(\d+|\d\d\d\d-?\w+-\d+)/", survexblock.survexfile.path) + if path_match: + pos_cave = '%s-%s' % (path_match.group(1), path_match.group(2)) + # print('Match') + # print(pos_cave) + cave = models.getCaveByReference(pos_cave) + if cave: + survexfile.cave = cave + svxlines = '' + svxlines = fin.read().splitlines() + # print('Cave - preloop ' + str(survexfile.cave)) + # print(survexblock) + for svxline in svxlines: + + # print(survexblock) + + # print(svxline) + # if not svxline: + # print(' - Not survex') + # return + # textlines.append(svxline) + + lineno += 1 + + # print(' - Line: %d' % lineno) + # break the line at the comment - sline, comment = re.match(r"([^;]*?)\s*(?:;\s*(.*))?\n?$", svxline.strip()).groups() - + sline, comment = regex_comment.match(svxline.strip()).groups() # detect ref line pointing to the scans directory - mref = comment and re.match(r'.*?ref.*?(\d+)\s*#\s*(\d+)', comment) + mref = comment and regex_ref.match(comment) if mref: refscan = "%s#%s" % (mref.group(1), mref.group(2)) survexscansfolders = 
models.SurvexScansFolder.objects.filter(walletname=refscan) if survexscansfolders: survexblock.survexscansfolder = survexscansfolders[0] #survexblock.refscandir = "%s/%s%%23%s" % (mref.group(1), mref.group(1), mref.group(2)) - survexblock.save() + survexblock.save() continue - + + # This whole section should be moved if we can have *QM become a proper survex command + # Spec of QM in SVX files, currently commented out need to add to survex + # needs to match regex_qm + # ;Serial number grade(A/B/C/D/X) nearest-station resolution-station description + # ;QM1 a hobnob_hallway_2.42 hobnob-hallway_3.42 junction of keyhole passage + # ;QM1 a hobnob_hallway_2.42 - junction of keyhole passage + qmline = comment and regex_qm.match(comment) + if qmline: + print(qmline.groups()) + #(u'1', u'B', u'miraclemaze', u'1.17', u'-', None, u'\tcontinuation of rift') + qm_no = qmline.group(1) + qm_grade = qmline.group(2) + qm_from_section = qmline.group(3) + qm_from_station = qmline.group(4) + qm_resolve_section = qmline.group(6) + qm_resolve_station = qmline.group(7) + qm_notes = qmline.group(8) + + print('Cave - %s' % survexfile.cave) + print('QM no %d' % int(qm_no)) + print('QM grade %s' % qm_grade) + print('QM section %s' % qm_from_section) + print('QM station %s' % qm_from_station) + print('QM res section %s' % qm_resolve_section) + print('QM res station %s' % qm_resolve_station) + print('QM notes %s' % qm_notes) + + # If the QM isn't resolved (has a resolving station) thn load it + if not qm_resolve_section or qm_resolve_section is not '-' or qm_resolve_section is not 'None': + from_section = models.SurvexBlock.objects.filter(name=qm_from_section) + # If we can find a section (survex note chunck, named) + if len(from_section) > 0: + print(from_section[0]) + from_station = models.SurvexStation.objects.filter(block=from_section[0], name=qm_from_station) + # If we can find a from station then we have the nearest station and can import it + if len(from_station) > 0: + print(from_station[0]) + qm = models.QM.objects.create(number=qm_no, + nearest_station=from_station[0], + grade=qm_grade.upper(), + location_description=qm_notes) + else: + print('QM found but resolved') + + #print('Cave -sline ' + str(cave)) if not sline: continue - + # detect the star command - mstar = re.match(r'\s*\*[\s,]*(\w+)\s*(.*?)\s*(?:;.*)?$', sline) + mstar = regex_star.match(sline) if not mstar: if "from" in stardata: - LoadSurvexLineLeg(survexblock, stardata, sline, comment) + # print('Cave ' + str(survexfile.cave)) + # print(survexblock) + LoadSurvexLineLeg(survexblock, stardata, sline, comment, survexfile.cave) + # print(' - From: ') + #print(stardata) + pass elif stardata["type"] == "passage": LoadSurvexLinePassage(survexblock, stardata, sline, comment) + # print(' - Passage: ') #Missing "station" in stardata. 
continue - + # detect the star command cmd, line = mstar.groups() cmd = cmd.lower() if re.match("include$(?i)", cmd): includepath = os.path.join(os.path.split(survexfile.path)[0], re.sub(r"\.svx$", "", line)) - includesurvexfile = models.SurvexFile(path=includepath, cave=survexfile.cave) + print(' - Include file found including - ' + includepath) + # Try to find the cave in the DB if not use the string as before + path_match = re.search(r"caves-(\d\d\d\d)/(\d+|\d\d\d\d-?\w+-\d+)/", includepath) + if path_match: + pos_cave = '%s-%s' % (path_match.group(1), path_match.group(2)) + # print(pos_cave) + cave = models.getCaveByReference(pos_cave) + if cave: + survexfile.cave = cave + else: + print('No match for %s' % includepath) + includesurvexfile = models.SurvexFile(path=includepath) includesurvexfile.save() includesurvexfile.SetDirectory() if includesurvexfile.exists(): + survexblock.save() fininclude = includesurvexfile.OpenFile() RecursiveLoad(survexblock, includesurvexfile, fininclude, textlines) - + elif re.match("begin$(?i)", cmd): - if line: + if line: + newsvxpath = os.path.join(os.path.split(survexfile.path)[0], re.sub(r"\.svx$", "", line)) + # Try to find the cave in the DB if not use the string as before + path_match = re.search(r"caves-(\d\d\d\d)/(\d+|\d\d\d\d-?\w+-\d+)/", newsvxpath) + if path_match: + pos_cave = '%s-%s' % (path_match.group(1), path_match.group(2)) + print(pos_cave) + cave = models.getCaveByReference(pos_cave) + if cave: + survexfile.cave = cave + else: + print('No match for %s' % newsvxpath) + name = line.lower() - survexblockdown = models.SurvexBlock(name=name, begin_char=fin.tell(), parent=survexblock, survexpath=survexblock.survexpath+"."+name, cave=survexblock.cave, survexfile=survexfile, totalleglength=0.0) + print(' - Begin found for: ' + name) + # print('Block cave: ' + str(survexfile.cave)) + survexblockdown = models.SurvexBlock(name=name, begin_char=fin.tell(), parent=survexblock, survexpath=survexblock.survexpath+"."+name, cave=survexfile.cave, survexfile=survexfile, totalleglength=0.0) survexblockdown.save() + survexblock.save() + survexblock = survexblockdown + # print(survexblockdown) textlinesdown = [ ] RecursiveLoad(survexblockdown, survexfile, fin, textlinesdown) else: iblankbegins += 1 - + elif re.match("end$(?i)", cmd): if iblankbegins: iblankbegins -= 1 else: survexblock.text = "".join(textlines) survexblock.save() + # print(' - End found: ') + endstamp = datetime.now() + timetaken = endstamp - stamp + # print(' - Time to process: ' + str(timetaken)) return - + elif re.match("date$(?i)", cmd): if len(line) == 10: - survexblock.date = re.sub(r"\.", "-", line) + #print(' - Date found: ' + line) + survexblock.date = make_aware(datetime.strptime(re.sub(r"\.", "-", line), '%Y-%m-%d'), get_current_timezone()) expeditions = models.Expedition.objects.filter(year=line[:4]) if expeditions: assert len(expeditions) == 1 survexblock.expedition = expeditions[0] survexblock.expeditionday = survexblock.expedition.get_expedition_day(survexblock.date) survexblock.save() - + elif re.match("team$(?i)", cmd): - mteammember = re.match(r"(Insts|Notes|Tape|Dog|Useless|Pics|Helper|Disto|Consultant)\s+(.*)$(?i)", line) + pass + # print(' - Team found: ') + mteammember = regex_team.match(line) if mteammember: - for tm in re.split(r" and | / |, | & | \+ |^both$|^none$(?i)", mteammember.group(2)): + for tm in regex_team_member.split(mteammember.group(2)): if tm: personexpedition = survexblock.expedition and 
GetPersonExpeditionNameLookup(survexblock.expedition).get(tm.lower()) if (personexpedition, tm) not in teammembers: @@ -179,18 +304,23 @@ def RecursiveLoad(survexblock, survexfile, fin, textlines): if personexpedition: personrole.person=personexpedition.person personrole.save() - + elif cmd == "title": - survextitle = models.SurvexTitle(survexblock=survexblock, title=line.strip('"'), cave=survexblock.cave) + #print(' - Title found: ') + survextitle = models.SurvexTitle(survexblock=survexblock, title=line.strip('"'), cave=survexfile.cave) survextitle.save() - + pass + elif cmd == "require": # should we check survex version available for processing? pass elif cmd == "data": + #print(' - Data found: ') ls = line.lower().split() stardata = { "type":ls[0] } + #print(' - Star data: ', stardata) + #print(ls) for i in range(0, len(ls)): stardata[stardataparamconvert.get(ls[i], ls[i])] = i - 1 if ls[0] in ["normal", "cartesian", "nosurvey"]: @@ -199,40 +329,23 @@ def RecursiveLoad(survexblock, survexfile, fin, textlines): stardata = stardatadefault else: assert ls[0] == "passage", line - + elif cmd == "equate": + #print(' - Equate found: ') LoadSurvexEquate(survexblock, line) elif cmd == "fix": + #print(' - Fix found: ') survexblock.MakeSurvexStation(line.split()[0]) else: + #print(' - Stuff') if cmd not in ["sd", "include", "units", "entrance", "data", "flags", "title", "export", "instrument", "calibrate", "set", "infer", "alias", "ref", "cs", "declination", "case"]: print("Unrecognised command in line:", cmd, line, survexblock, survexblock.survexfile.path) - - -def ReloadSurvexCave(survex_cave, area): - print(survex_cave, area) - cave = models.Cave.objects.get(kataster_number=survex_cave, area__short_name=area) - print(cave) - #cave = models.Cave.objects.get(kataster_number=survex_cave) - cave.survexblock_set.all().delete() - cave.survexfile_set.all().delete() - cave.survexdirectory_set.all().delete() - - survexfile = models.SurvexFile(path="caves-" + cave.kat_area() + "/" + survex_cave + "/" + survex_cave, cave=cave) - survexfile.save() - survexfile.SetDirectory() - - survexblockroot = models.SurvexBlock(name="root", survexpath="caves-" + cave.kat_area(), begin_char=0, cave=cave, survexfile=survexfile, totalleglength=0.0) - survexblockroot.save() - fin = survexfile.OpenFile() - textlines = [ ] - RecursiveLoad(survexblockroot, survexfile, fin, textlines) - survexblockroot.text = "".join(textlines) - survexblockroot.save() - + endstamp = datetime.now() + timetaken = endstamp - stamp + # print(' - Time to process: ' + str(timetaken)) def LoadAllSurvexBlocks(): @@ -249,7 +362,7 @@ def LoadAllSurvexBlocks(): print(" - Data flushed") - survexfile = models.SurvexFile(path="all", cave=None) + survexfile = models.SurvexFile(path=settings.SURVEX_TOPNAME, cave=None) survexfile.save() survexfile.SetDirectory() @@ -258,22 +371,13 @@ def LoadAllSurvexBlocks(): survexblockroot.save() fin = survexfile.OpenFile() textlines = [ ] + # The real work starts here RecursiveLoad(survexblockroot, survexfile, fin, textlines) + fin.close() survexblockroot.text = "".join(textlines) survexblockroot.save() - #Load each cave, - #FIXME this should be dealt with load all above - print(" - Reloading all caves") - caves = models.Cave.objects.all() - for cave in caves: - if cave.kataster_number and os.path.isdir(os.path.join(settings.SURVEX_DATA, "caves-" + cave.kat_area(), cave.kataster_number)): - if cave.kataster_number not in ['40']: - print("loading", cave, cave.kat_area()) - ReloadSurvexCave(cave.kataster_number, 
cave.kat_area()) - - poslineregex = re.compile(r"^\(\s*([+-]?\d*\.\d*),\s*([+-]?\d*\.\d*),\s*([+-]?\d*\.\d*)\s*\)\s*([^\s]+)$") @@ -281,12 +385,12 @@ def LoadPos(): print('Loading Pos....') - call([settings.CAVERN, "--output=%s/all.3d" % settings.SURVEX_DATA, "%s/all.svx" % settings.SURVEX_DATA]) - call([settings.THREEDTOPOS, '%sall.3d' % settings.SURVEX_DATA], cwd = settings.SURVEX_DATA) - posfile = open("%sall.pos" % settings.SURVEX_DATA) + call([settings.CAVERN, "--output=%s%s.3d" % (settings.SURVEX_DATA, settings.SURVEX_TOPNAME), "%s%s.svx" % (settings.SURVEX_DATA, settings.SURVEX_TOPNAME)]) + call([settings.THREEDTOPOS, '%s%s.3d' % (settings.SURVEX_DATA, settings.SURVEX_TOPNAME)], cwd = settings.SURVEX_DATA) + posfile = open("%s%s.pos" % (settings.SURVEX_DATA, settings.SURVEX_TOPNAME)) posfile.readline() #Drop header for line in posfile.readlines(): - r = poslineregex.match(line) + r = poslineregex.match(line) if r: x, y, z, name = r.groups() try: diff --git a/parsers/surveys.py b/parsers/surveys.py index 72a05f3..efab536 100644 --- a/parsers/surveys.py +++ b/parsers/surveys.py @@ -99,7 +99,7 @@ def parseSurveyScans(expedition, logfile=None): #scanList = listdir(expedition.year, surveyFolder) scanList=os.listdir(os.path.join(yearPath,surveyFolder)) except AttributeError: - print(surveyFolder + " ignored\r",) + print("Folder: " + surveyFolder + " ignored\r") continue for scan in scanList: @@ -107,7 +107,7 @@ def parseSurveyScans(expedition, logfile=None): scanChopped=re.match(r'(?i).*(notes|elev|plan|elevation|extend)(\d*)\.(png|jpg|jpeg)',scan).groups() scanType,scanNumber,scanFormat=scanChopped except AttributeError: - print(scan + " ignored\r",) + print("File: " + scan + " ignored\r") continue if scanType == 'elev' or scanType == 'extend': scanType = 'elevation' @@ -174,9 +174,6 @@ def GetListDir(sdir): ff = os.path.join(sdir, f) res.append((f, ff, os.path.isdir(ff))) return res - - - def LoadListScansFile(survexscansfolder): diff --git a/parsers/surveysM.py b/parsers/surveysM.py deleted file mode 100644 index 2b94b02..0000000 --- a/parsers/surveysM.py +++ /dev/null @@ -1,65 +0,0 @@ -from django.conf import settings -import subprocess, re -import troggle.core.models as models - -def load(): - print('Load survex files and relations') - load_area('1623') - -def load_area(areacode): - - print('Searching all cave dirs files') - basedir = settings.SURVEX_DATA+'caves-'+areacode+'/' - - cavedirs = bash("find "+basedir+" -maxdepth 1 -type d").splitlines() #this command finds all directories - print('Obtained list of directories! (#dirs='+str(len(cavedirs))+')') - - for cavedir in cavedirs: - if cavedir==basedir: - continue #skip the basedir - a non-proper subdirectory - parentname = bash('echo '+cavedir+' | rev | cut -f1 -d \'/\' | rev').splitlines()[0] #get final bit of the directory - parentcave = models.CaveM.objects.filter(survex_file__icontains=cavedir) - if len(parentcave)>1: - print('Non unique parent - skipping. Name:'+parentname) - elif len(parentcave)==0: - print('Error! 
parent not created:'+parentname) - continue - else: #exaclty one match - print('Adding relations of:'+parentname) - parentcave = parentcave[0] - - surveyfiles = bash('find '+cavedir+' -name \'*.svx\'').splitlines() - for fn in surveyfiles: - print(fn) - svxcontents = open(fn,'r').read().splitlines() - try: - dateline = [x for x in svxcontents if ('*date' in x)][0] - date = re.findall('\\d\\d\\d\\d\\.\\d\\d\\.\\d\\d', dateline, re.S)[0] - - - except: - if( len( [x for x in svxcontents if ('*date' in x)] ) == 0 ): - continue #skip dateless files - print('Date format error in '+fn) - print('Dateline = '+ '"'.join([x for x in svxcontents if ('*date' in x)])) - date = '1900.01.01' - - - newsurvex = models.SurveyM(survex_file=fn, date=date) - newsurvex.save() - parentcave.surveys.add(newsurvex) - parentcave.save() - - -def file_exists(filename): - test = bash('if [ ! -f '+filename+' ] ; then echo MISSING; fi')#test for file exisence - if 'MISSING' in test: #send error message to the database - return False - return True - -def bash(cmd): #calls command in bash shell, returns output - process = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE) - output, error = process.communicate() - return output - -
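A closing usage note on the QM support added in parsers/survex.py above: the spec comment in that hunk documents the expected ;QM comment format, and the new regex_qm decomposes it as in this standalone sketch. The sample station names are the hypothetical ones from that spec comment; the parser strips the leading ';' (via regex_comment) before matching.

# Standalone sketch of the QM comment format documented in the
# parsers/survex.py hunk above. The sample line is the hypothetical one
# from the spec comment, with the leading ';' already stripped.
import re

regex_qm = re.compile(r'^\s*QM(\d)\s+?([a-dA-DxX])\s+([\w\-]+)\.(\d+)\s+(([\w\-]+)\.(\d+)|\-)\s+(.+)$')

m = regex_qm.match("QM1 a hobnob_hallway_2.42 hobnob-hallway_3.42 junction of keyhole passage")
assert m
# group(1) QM number           -> '1'
# group(2) grade               -> 'a'
# group(3), group(4) nearest   -> 'hobnob_hallway_2', '42'
# group(6), group(7) resolving -> 'hobnob-hallway_3', '42'
#                                 (both None when the field is '-')
# group(8) description         -> 'junction of keyhole passage'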