diff options
Diffstat (limited to 'parsers')
-rw-r--r-- | parsers/caves.py    |  9
-rw-r--r-- | parsers/imports.py  | 12
-rw-r--r-- | parsers/logbooks.py |  3
-rw-r--r-- | parsers/survex.py   | 90
4 files changed, 65 insertions, 49 deletions
diff --git a/parsers/caves.py b/parsers/caves.py index d1e7406..2bb2ccc 100644 --- a/parsers/caves.py +++ b/parsers/caves.py @@ -192,6 +192,8 @@ def readcave(filename): url = url[0], filename = filename) except: + # this slow db query happens on every cave, but on import we have all this in memory + # and don't need to do a db query. Fix this to speed it up! # need to cope with duplicates print(" ! FAILED to get only one CAVE when updating using: "+filename) kaves = models_caves.Cave.objects.all().filter(kataster_number=kataster_number[0]) @@ -206,6 +208,8 @@ def readcave(filename): c = k for area_slug in areas: + # this slow db query happens on every cave, but on import we have all this in memory + # and don't need to do a db query. Fix this to speed it up! area = models_caves.Area.objects.filter(short_name = area_slug) if area: newArea = area[0] @@ -216,6 +220,8 @@ def readcave(filename): primary = True for slug in slugs: try: + # this slow db query happens on every cave, but on import we have all this in memory + # and don't need to do a db query. Fix this to speed it up! cs = models_caves.CaveSlug.objects.update_or_create(cave = c, slug = slug, primary = primary) @@ -225,10 +231,13 @@ def readcave(filename): print(message) primary = False + for entrance in entrances: slug = getXML(entrance, "entranceslug", maxItems = 1, context = context)[0] letter = getXML(entrance, "letter", maxItems = 1, context = context)[0] try: + # this slow db query happens on every entrance, but on import we have all this in memory + # and don't need to do a db query. Fix this to speed it up! 
entrance = models_caves.Entrance.objects.get(entranceslug__slug = slug) ce = models_caves.CaveAndEntrance.objects.update_or_create(cave = c, entrance_letter = letter, entrance = entrance) except: diff --git a/parsers/imports.py b/parsers/imports.py index c2965c2..f8c98a3 100644 --- a/parsers/imports.py +++ b/parsers/imports.py @@ -16,21 +16,21 @@ import troggle.parsers.logbooks import troggle.parsers.QMs def import_caves(): - print("Importing Caves to ",end="") + print("-- Importing Caves to ",end="") print(django.db.connections.databases['default']['NAME']) troggle.parsers.caves.readcaves() def import_people(): - print("Importing People (folk.csv) to ",end="") + print("-- Importing People (folk.csv) to ",end="") print(django.db.connections.databases['default']['NAME']) troggle.parsers.people.LoadPersonsExpos() def import_surveyscans(): - print("Importing Survey Scans") + print("-- Importing Survey Scans") troggle.parsers.surveys.LoadListScans() def import_logbooks(): - print("Importing Logbooks") + print("-- Importing Logbooks") troggle.parsers.logbooks.LoadLogbooks() def import_QMs(): @@ -40,7 +40,7 @@ def import_QMs(): def import_survex(): # when this import is moved to the top with the rest it all crashes horribly import troggle.parsers.survex - print("Importing Survex Blocks") + print("-- Importing Survex Blocks") print(" - Survex Blocks") troggle.parsers.survex.LoadSurvexBlocks() print(" - Survex entrances x/y/z Positions") @@ -53,6 +53,6 @@ def import_loadpos(): troggle.parsers.survex.LoadPos() def import_drawingsfiles(): - print("Importing Drawings files") + print("-- Importing Drawings files") troggle.parsers.surveys.LoadDrawingFiles() diff --git a/parsers/logbooks.py b/parsers/logbooks.py index c4f2c9c..b18e839 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -114,6 +114,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_ expeditionday = expedition.get_expedition_day(date) lookupAttribs={'date':date, 
'title':title} # 'cave' is converted to a string doing this, which renders as the cave slug. + # but it is a db query which we should try to avoid - rewrite this nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug':slugify(title)[:50], 'entry_type':entry_type} lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs) @@ -356,6 +357,8 @@ def SetDatesFromLogbookEntries(expedition): Sets the date_from and date_to field for an expedition based on persontrips. Then sets the expedition date_from and date_to based on the personexpeditions. """ + # Probably a faster way to do this. This uses a lot of db queries, but we have all this + # in memory.. for personexpedition in expedition.personexpedition_set.all(): persontrips = personexpedition.persontrip_set.order_by('logbook_entry__date') # sequencing is difficult to do diff --git a/parsers/survex.py b/parsers/survex.py index da0395d..7db8af0 100644 --- a/parsers/survex.py +++ b/parsers/survex.py @@ -324,7 +324,8 @@ class LoadingSurvex(): return self.caveslist[g] print(' ! Failed to find cave for {}'.format(cavepath.lower())) else: - print(' ! No regex cave match for %s' % cavepath.lower()) + # not a cave, but that is fine. + # print(' ! No regex(standard identifier) cave match for %s' % cavepath.lower()) return None def GetSurvexDirectory(self, headpath): @@ -353,17 +354,17 @@ class LoadingSurvex(): print("\n"+message,file=sys.stderr) models.DataIssue.objects.create(parser='survex', message=message) - def LoadSurvexFile(self, includelabel): + def LoadSurvexFile(self, svxid): """Creates SurvexFile in the database, and SurvexDirectory if needed with links to 'cave' - Creates a new current survexblock with valid .survexfile and valid .survexdirectory + Creates a new current survexfile and valid .survexdirectory The survexblock passed-in is not necessarily the parent. FIX THIS. 
""" depth = " " * self.depthbegin - print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, includelabel)) - headpath, tail = os.path.split(includelabel) + print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid)) + headpath = os.path.dirname(svxid) - newfile = models_survex.SurvexFile(path=includelabel) + newfile = models_survex.SurvexFile(path=svxid) newfile.save() # until we do this there is no internal id so no foreign key works self.currentsurvexfile = newfile newdirectory = self.GetSurvexDirectory(headpath) @@ -383,10 +384,10 @@ class LoadingSurvex(): newfile.cave = cave #print("\n"+str(newdirectory.cave),file=sys.stderr) else: - self.ReportNonCaveIncludes(headpath, includelabel) + self.ReportNonCaveIncludes(headpath, svxid) if not newfile.survexdirectory: - message = " ! SurvexDirectory NOT SET in new SurvexFile {} ".format(includelabel) + message = " ! SurvexDirectory NOT SET in new SurvexFile {} ".format(svxid) print(message) print(message,file=sys.stderr) models.DataIssue.objects.create(parser='survex', message=message) @@ -401,7 +402,7 @@ class LoadingSurvex(): def ProcessIncludeLine(self, included): svxid = included.groups()[0] #depth = " " * self.depthbegin - #print("{:2}{} - Include survexfile:'{}'".format(self.depthbegin, depth, svxid)) + #print("{:2}{} - Include survexfile:'{}' {}".format(self.depthbegin, depth, svxid, included)) self.LoadSurvexFile(svxid) self.stacksvxfiles.append(self.currentsurvexfile) @@ -426,8 +427,10 @@ class LoadingSurvex(): self.LoadSurvexQM(survexblock, qmline) included = self.rx_comminc.match(comment) - # ;*include means we have been included; not 'proceed to include' which *include means + # ;*include means 'we have been included'; whereas *include means 'proceed to include' if included: + #depth = " " * self.depthbegin + #print("{:2}{} - Include comment:'{}' {}".format(self.depthbegin, depth, comment, included)) self.ProcessIncludeLine(included) edulcni = self.rx_commcni.match(comment) @@ 
-457,7 +460,7 @@ class LoadingSurvex(): def LinearLoad(self, survexblock, path, svxlines): """Loads a single survex file. Usually used to import all the survex files which have been collated - into a single file. Loads the begin/end blocks recursively. + into a single file. Loads the begin/end blocks using a stack for labels. """ self.relativefilename = path cave = self.IdentifyCave(path) # this will produce null for survex files which are geographic collections @@ -466,19 +469,25 @@ class LoadingSurvex(): self.currentsurvexfile.save() # django insists on this although it is already saved !? blockcount = 0 + lineno = 0 def tickle(): nonlocal blockcount blockcount +=1 if blockcount % 10 ==0 : print(".", file=sys.stderr,end='') - if blockcount % 500 ==0 : + if blockcount % 200 ==0 : print("\n", file=sys.stderr,end='') - sys.stderr.flush(); + print(" - MEM:{:7.3f} MB in use".format(models.get_process_memory()),file=sys.stderr) + sys.stderr.flush() for svxline in svxlines: - sline, comment = self.rx_comment.match(svxline.strip()).groups() + lineno += 1 + sline, comment = self.rx_comment.match(svxline).groups() if comment: + depth = " " * self.depthbegin + print("{:4} {:2}{} - Include comment:'{}' {}".format(lineno, self.depthbegin, depth, comment, sline)) self.LoadSurvexComment(survexblock, comment) # this catches the ;*include and ;*edulcni lines too + if not sline: continue # skip blank lines @@ -503,10 +512,10 @@ class LoadingSurvex(): pathlist += "." + id newsurvexblock = models_survex.SurvexBlock(name=blockid, parent=survexblock, survexpath=pathlist, - title = survexblock.title, # copy parent inititally cave=self.currentcave, survexfile=self.currentsurvexfile, legsall=0, legssplay=0, legssurfc=0, totalleglength=0.0) newsurvexblock.save() + newsurvexblock.title = "("+survexblock.title+")" # copy parent initially survexblock = newsurvexblock # survexblock.survexfile.save() survexblock.save() # django insists on this , but we want to save at the end ! 
@@ -564,7 +573,7 @@ class LoadingSurvex(): else: pass # ignore all other sorts of data - def RecursiveScan(self, survexblock, survexfile, fin, flinear, fcollate): + def RecursiveScan(self, survexblock, path, fin, flinear, fcollate): """Follows the *include links in all the survex files from the root file 1623.svx and reads only the *include and *begin and *end statements. It produces a linearised list of the include tree @@ -577,27 +586,27 @@ class LoadingSurvex(): if self.callcount % 500 ==0 : print("\n", file=sys.stderr,end='') - if survexfile in self.svxfileslist: - message = " * Warning. Survex file already seen: {}".format(survexfile.path) + if path in self.svxfileslist: + message = " * Warning. Duplicate in *include list at:{} depth:{} file:{}".format(self.callcount, self.depthinclude, path) print(message) print(message,file=flinear) - print(message,file=sys.stderr) + print("\n"+message,file=sys.stderr) models.DataIssue.objects.create(parser='survex', message=message) - if self.svxfileslist.count(survexfile) > 20: - message = " ! ERROR. Survex file already seen 20x. Probably an infinite loop so fix your *include statements that include this. Aborting. {}".format(survexfile.path) + if self.svxfileslist.count(path) > 20: + message = " ! ERROR. Survex file already seen 20x. Probably an infinite loop so fix your *include statements that include this. Aborting. 
{}".format(path) print(message) print(message,file=flinear) print(message,file=sys.stderr) models.DataIssue.objects.create(parser='survex', message=message) return - self.svxfileslist.append(survexfile) + self.svxfileslist.append(path) svxlines = fin.read().splitlines() for svxline in svxlines: self.lineno += 1 includestmt =self.rx_include.match(svxline) if not includestmt: - fcollate.write("{}\n".format(svxline)) + fcollate.write("{}\n".format(svxline.strip())) sline, comment = self.rx_comment.match(svxline.strip()).groups() mstar = self.rx_star.match(sline) @@ -605,40 +614,35 @@ class LoadingSurvex(): cmd, args = mstar.groups() cmd = cmd.lower() if re.match("(?i)include$", cmd): - includepath = os.path.normpath(os.path.join(os.path.split(survexfile.path)[0], re.sub(r"\.svx$", "", args))) - path_match = re.search(r"caves-(\d\d\d\d)/(\d+|\d\d\d\d-?\w+-\d+)/", includepath) - - includesurvexfile = models_survex.SurvexFile(path=includepath) - includesurvexfile.save() + includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args))) + #path_match = re.search(r"caves-(\d\d\d\d)/(\d+|\d\d\d\d-?\w+-\d+)/", includepath) - if includesurvexfile.exists(): - # do not create SurvexFile in DB here by doing includesurvexfile.save(). Do it when reading data. 
+ fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx") + if os.path.isfile(fullpath): #-------------------------------------------------------- self.depthinclude += 1 - fininclude = includesurvexfile.OpenFile() - fcollate.write(";*include {}\n".format(includesurvexfile.path)) - flinear.write("{:2} {} *include {}\n".format(self.depthinclude, indent, includesurvexfile.path)) - push = includesurvexfile.path.lower() + fininclude = open(fullpath,'r') + fcollate.write(";*include {}\n".format(includepath)) + flinear.write("{:2} {} *include {}\n".format(self.depthinclude, indent, includepath)) + push = includepath.lower() self.stackinclude.append(push) #----------------- - self.RecursiveScan(survexblock, includesurvexfile, fininclude, flinear, fcollate) + self.RecursiveScan(survexblock, includepath, fininclude, flinear, fcollate) #----------------- pop = self.stackinclude.pop() if pop != push: - message = "!!!!!!! ERROR pop != push {} != {} {}".format(pop, push, self.stackinclude) + message = "!! ERROR mismatch *include pop!=push {}".format(pop, push, self.stackinclude) print(message) print(message,file=flinear) print(message,file=sys.stderr) models.DataIssue.objects.create(parser='survex', message=message) - includesurvexfile.path += "-TEMP" - includesurvexfile = None flinear.write("{:2} {} *edulcni {}\n".format(self.depthinclude, indent, pop)) fcollate.write(";*edulcni {}\n".format(pop)) fininclude.close() self.depthinclude -= 1 #-------------------------------------------------------- else: - message = " ! ERROR *include file not found for [{}]:'{}'".format(includesurvexfile, includepath) + message = " ! ERROR *include file not found for:'{}'".format(includepath) print(message) print(message,file=sys.stderr) models.DataIssue.objects.create(parser='survex', message=message) @@ -659,7 +663,7 @@ class LoadingSurvex(): args = " " popargs = self.stackbegin.pop() if popargs != args.lower(): - message = "!!!!!!! 
ERROR BEGIN/END pop != push {} != {}\n{}".format(popargs, args, self. stackbegin) + message = "!! ERROR mismatch in BEGIN/END labels pop!=push '{}'!='{}'\n{}".format(popargs, args, self. stackbegin) print(message) print(message,file=flinear) print(message,file=sys.stderr) @@ -701,7 +705,7 @@ def FindAndLoadSurvex(survexblockroot): fcollate.write(";*include {}\n".format(survexfileroot.path)) flinear.write("{:2} {} *include {}\n".format(svx_scan.depthinclude, indent, survexfileroot.path)) #---------------------------------------------------------------- - svx_scan.RecursiveScan(survexblockroot, survexfileroot, finroot, flinear, fcollate) + svx_scan.RecursiveScan(survexblockroot, survexfileroot.path, finroot, flinear, fcollate) #---------------------------------------------------------------- flinear.write("{:2} {} *edulcni {}\n".format(svx_scan.depthinclude, indent, survexfileroot.path)) fcollate.write(";*edulcni {}\n".format(survexfileroot.path)) @@ -712,7 +716,7 @@ def FindAndLoadSurvex(survexblockroot): flinear.write(" - {:,} survex files in linear include list \n".format(len(svxfileslist))) flinear.close() fcollate.close() - svx_scan = None + svx_scan = None # Hmm. Does this actually delete all the instance variables if they are lists, dicts etc.? print("\n - {:,} survex files in linear include list \n".format(len(svxfileslist)),file=sys.stderr) mem1 = models.get_process_memory() @@ -724,7 +728,7 @@ def FindAndLoadSurvex(survexblockroot): # entrance locations currently loaded after this by LoadPos(), but could better be done before ? # look in MapLocations() for how we find the entrances - print('\n - Loading All Survex Blocks (LinearRecursive)',file=sys.stderr) + print('\n - Loading All Survex Blocks (LinearLoad)',file=sys.stderr) svx_load = LoadingSurvex() svx_load.svxdirs[""] = survexfileroot.survexdirectory |