summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--parsers/survex.py206
1 files changed, 141 insertions, 65 deletions
diff --git a/parsers/survex.py b/parsers/survex.py
index 1f97dd0..57ae52c 100644
--- a/parsers/survex.py
+++ b/parsers/survex.py
@@ -19,18 +19,11 @@ from troggle.parsers.logbooks import GetCaveLookup
from troggle.core.models.troggle import DataIssue, Expedition
from troggle.core.models.survex import SurvexPersonRole, Wallet, SurvexDirectory, SurvexFile, SurvexBlock, SurvexStation
-'''Imports the tree of survex files following form a defined root .svx file
-It does also NOT scan the Loser repo for all the svx files - though it should !
+'''Imports the tree of survex files following from a defined root .svx file
+It also scans the Loser repo for all the svx files, which it loads individually afterwards.
'''
-todo = '''Also walk the entire tree in the :loser: repo looking for unconnected survex files
-- add them to the system so that they can be reported-on
-- produce a parser report and create a troggle report page (some are OK, e.g. futility series replaced by ARGE survey in 115)
-
-- If you look at e.g. http://expo.survex.com/survexfile/161#T_caves-1623/161/lhr/alllhr
- you will see than have the team members are recognised by this parser, but not recognised by the
- wider troggle system (the name is not a hyperlink) - apparently randomly.
- GetPersonExpeditionNameLookup() needs to be fixed.
+todo = '''
-#BUG, if *date comes after *team, the person's date is not set at all. It needs re-setting at the endof the block.
@@ -105,7 +98,8 @@ def get_people_on_trip(survexblock):
for p in qpeople:
people.append(f'{p.personname}')
return list(set(people))
-
+
+
class LoadingSurvex():
"""A 'survex block' is a *begin...*end set of cave data.
A survex file can contain many begin-end blocks, which can be nested, and which can *include
@@ -128,8 +122,10 @@ class LoadingSurvex():
rx_names = re.compile(r'(?i)names')
rx_flagsnot= re.compile(r"not\s")
rx_linelen = re.compile(r"[\d\-+.]+$")
- instruments = "(waiting_patiently|slacker|Useless|nagging|unknown|Inst|instrument|rig|rigger|rigging|helper|something| compass|comp|clino|Notes|sketch|book|Tape|Dog|Pics|photo|drawing|Helper|GPS|Disto|Distox|Distox2|topodroid|point|Consultant|nail|polish|nail_polish_bitch|nail_polish_monkey|varnish|nail_polish|nail_varnish|bitch|monkey|PowerDrill|drill)"
- rx_teammem = re.compile(r"(?i)"+instruments+"?(?:es|s)?\s+(.*)"+instruments+"?(?:es|s)?$")
+ instruments = "(bitch|bodger|bolt|bolter|bolting|book|clino|comp|compass|consultant|disto|distox|distox2|dog|dogsbody|drawing|drill|gps|helper|inst|instr|instrument|monkey|nagging|nail|nail_polish|nail_polish_bitch|nail_polish_monkey|nail_varnish|nail_varnish_bitch|note|paint|photo|pic|point|polish|powerdrill|rig|rigger|rigging|sketch|slacker|something|tape|topodroid|unknown|useless|varnish|waiting_patiently)"
+ rx_teammem = re.compile(r"(?i)"+instruments+"?(?:es|s)?\s+(.*)$")
+ rx_teamold = re.compile(r"(?i)(.*)\s+"+instruments+"?(?:es|s)?$")
+ rx_teamabs = re.compile(r"(?i)^\s*("+instruments+")?(?:es|s)?\s*$")
rx_person = re.compile(r"(?i) and | / |, | & | \+ |^both$|^none$")
rx_qm = re.compile(r'(?i)^\s*QM(\d+)\s+?([a-dA-DxX])\s+([\w\-\_]+)\.([\w\.\-]+)\s+(([\w\-]+)\.([\w\.\-]+)|\-)\s+(.+)$')
# does not recognise non numeric suffix survey point ids
@@ -228,22 +224,74 @@ class LoadingSurvex():
personrole is used to record that a person was on a trip, NOT the role they played.
(NB PersonTrip is a logbook thing)
"""
- teammembers = [ ]
- mteammember = self.rx_teammem.match(line)
- if mteammember:
+ def record_team_member(tm, survexblock):
+ tm = tm.strip('\"\'')
+ # Refactor. The dict GetPersonExpeditionNameLookup(expo) indexes by name and has values of personexpedition
+ # This is convoluted, the whole personexpedition concept is unnecessary.
+
+ # we need the current expedition, but if there has been no date yet in the survex file, we don't know which one it is.
+ # so we can't validate whether the person was on expo or not.
+ # we will have to attach them to the survexblock anyway, and then do a
+ # later check on whether they are valid when we get the date.
+
+ personrole, created = SurvexPersonRole.objects.update_or_create(survexblock=survexblock, personexpedition=personexpedition, personname=tm)
+
+ expo = survexblock.expedition # may be None if no *date yet
+ # this syntax was bizarre.. made more obvious
+ if expo:
+ if survexblock.expeditionday: # *date has been set
+ personrole.expeditionday = survexblock.expeditionday
+ else:
+ # should not happen
+ message = "! *team {} expo ok, expedition day not in *team {} ({}) created? '{}'".format(expo.year, survexblock.survexfile.path, survexblock, created )
+ print(self.insp+message)
+ DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+
+
+ personexpedition = GetPersonExpeditionNameLookup(expo).get(tm.lower())
+ personrole.person=personexpedition.person
+ self.currentpersonexped.append(personexpedition)
+
+ if not personexpedition:
+ # we know the date and expo, but can't find the person
+ message = "! *team {} '{}' FAIL personexpedition lookup on *team {} ({}) in '{}' {} ".format(expo.year, tm, survexblock.survexfile.path, survexblock, created, line)
+ print(self.insp+message)
+ DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+ else:
+ personexpedition = None
+ # don't know the date yet, assume the person is valid. It will get picked up when the *date appears
+
+ personrole.save()
+
+ mteammember = self.rx_teammem.match(line) # matches the role at the beginning
+ if not mteammember:
+ moldstyle = self.rx_teamold.match(line) # matches the role at the end of the string
+ if moldstyle:
+ for tm in self.rx_person.split(moldstyle.group(1)):
+ if tm:
+ record_team_member(tm, survexblock)
+ # seems to be working
+ # msg = "! OLD tm='{}' line: '{}' ({}) {}".format(tm, line, survexblock, survexblock.survexfile.path)
+ # print(msg, file=sys.stderr)
+ else:
+ message = "! *team {} ({}) Weird '{}' oldstyle line: '{}'".format(survexblock.survexfile.path, survexblock, mteammember.group(1), line)
+ print(self.insp+message)
+ DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+ else:
+ nullmember = self.rx_teamabs.match(line) # matches empty role line. Ignore these.
+ if not nullmember:
+ message = "! *team {} ({}) Bad line: '{}'".format(survexblock.survexfile.path, survexblock, line)
+ print(self.insp+message)
+ DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+ else:
for tm in self.rx_person.split(mteammember.group(2)):
if tm:
- tm = tm.strip('\"\'')
- personexpedition = survexblock.expedition and GetPersonExpeditionNameLookup(survexblock.expedition).get(tm.lower())
- if (personexpedition, tm) not in teammembers:
- teammembers.append((personexpedition, tm))
- personrole = SurvexPersonRole(survexblock=survexblock, personexpedition=personexpedition, personname=tm)
- personrole.save()
- personrole.expeditionday = survexblock.expeditionday #BUG, if *date comes after *team, this is NOT SET.
- if personexpedition:
- personrole.person=personexpedition.person
- self.currentpersonexped.append(personexpedition)
- personrole.save()
+ record_team_member(tm, survexblock)
+ else:
+ if not mteammember.group(2).lower() in ('none', 'both'):
+ message = "! Weird *team '{}' newstyle line: '{}' ({}) {}".format(mteammember.group(2), line, survexblock, survexblock.survexfile.path)
+ print(self.insp+message)
+ DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
def LoadSurvexEntrance(self, survexblock, line):
# Not using this yet
@@ -284,29 +332,53 @@ class LoadingSurvex():
message = "! *UNITS in YARDS!? - not converted '{}' ({}) {}".format(line, survexblock, survexblock.survexfile.path)
print(self.insp+message)
DataIssue.objects.create(parser='survexunits', message=message)
-
+
+ def get_expo_from_year(self, year):
+ # caching to save DB query on every block
+ if year in self.expos:
+ expo = self.expos[year]
+ else:
+ expeditions = Expedition.objects.filter(year=year)
+ if len(expeditions) != 1 :
+ message = f"! More than one expedition in year {year} '{line}' ({survexblock}) {survexblock.survexfile.path}"
+ print(self.insp+message)
+ DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+
+ expo= expeditions[0]
+ self.expos[year]= expo
+ return expo
+
def LoadSurvexDate(self, survexblock, line):
# we should make this a date RANGE for everything?
- def findexpedition(year):
- return Expedition.objects.filter(year=year)
- def setdate(year):
- # cacheing to save DB query on every block
- if year in self.expos:
- expo = self.expos[year]
- else:
- expeditions = findexpedition(year)
- if len(expeditions) != 1 :
- message = f"! More than one expedition in year {year} '{line}' ({survexblock}) {survexblock.survexfile.path}"
- print(self.insp+message)
- DataIssue.objects.create(parser='survexunits', message=message)
-
- expo= expeditions[0]
- self.expos[year]= expo
-
+ def setdate_on_survexblock(year):
+ expo = self.get_expo_from_year(year)
survexblock.expedition = expo
- survexblock.expeditionday = survexblock.expedition.get_expedition_day(survexblock.date)
+ survexblock.expeditionday = expo.get_expedition_day(survexblock.date)
survexblock.save()
+
+ team = SurvexPersonRole.objects.filter(survexblock=survexblock)
+ for p in team:
+ if not p.expeditionday: # *date and *team in 'wrong' order. All working now.
+
+ p.expeditionday = survexblock.expeditionday
+ p.save()
+
+ if not p.personexpedition: # again, we didn't know the date until now
+ pe = GetPersonExpeditionNameLookup(expo).get(p.personname.lower())
+ if pe:
+ # message = "! {} ({}) Fixing undated personexpedition '{}'".format(survexblock.survexfile.path, survexblock, p.personname)
+ # print(self.insp+message)
+ # DataIssue.objects.create(parser='survex', message=message)
+ p.personexpedition = pe
+ p.person = p.personexpedition.person
+ p.save()
+ else:
+ message = "! *team {} '{}' FAIL personexpedition lookup on *date {} ({}) '{}'".format(year, p, survexblock.survexfile.path, survexblock, p.personname)
+ print(self.insp+message)
+ DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+
+
oline = line
if len(line) > 10:
@@ -320,7 +392,7 @@ class LoadingSurvex():
# TO DO set to correct Austrian timezone Europe/Vienna ?
# %m and %d need leading zeros. Source svx files require them.
survexblock.date = datetime.strptime(line.replace('.','-'), '%Y-%m-%d')
- setdate(year)
+ setdate_on_survexblock(year)
elif len(line) == 7:
year = line[:4]
perps = get_people_on_trip(survexblock) # What, you don't know Judge Dredd slang ?
@@ -328,7 +400,7 @@ class LoadingSurvex():
print(self.insp+message)
DataIssue.objects.create(parser='svxdate', message=message, url=get_offending_filename(survexblock.survexfile.path))
survexblock.date = datetime.strptime(line.replace('.','-'), '%Y-%m') # sets to first of month
- setdate(year)
+ setdate_on_survexblock(year)
elif len(line) == 4:
year = line[:4]
perps = get_people_on_trip(survexblock)
@@ -336,13 +408,13 @@ class LoadingSurvex():
print(self.insp+message)
DataIssue.objects.create(parser='svxdate', message=message, url=get_offending_filename(survexblock.survexfile.path))
survexblock.date = datetime.strptime(line, '%Y') # sets to January 1st
- setdate(year)
+ setdate_on_survexblock(year)
else:
# these errors are reporting the wrong survexblock, which is actually a SurvexFile (!)
- message = "! DATE Error unrecognised '{}' ({}) {}".format(oline, survexblock, survexblock.survexfile.path)
+ message = "! DATE Error unrecognised '{}-{}' ({}) {}".format(oline, survexblock, type(survexblock), survexblock.survexfile.path)
print(self.insp+message)
DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
- print(f" {survexblock.parent=}") # fails as SUrvexFile has no .parent ...ugh.
+ print(f" {type(survexblock)=}") # survexblock.parent fails as a SurvexFile has no .parent ...ugh.
print(f" {survexblock.survexpath=}")
print(f" {survexblock.survexfile=}")
#raise
@@ -976,6 +1048,7 @@ class LoadingSurvex():
def LinearLoad(self, survexblock, path, collatefilename):
"""Loads a single survex file. Usually used to import all the survex files which have been collated
into a single file. Loads the begin/end blocks using a stack for labels.
+ Uses the python generator idiom to avoid loading the whole file (21MB) into memory.
"""
blkid = None
pathlist = None
@@ -1192,7 +1265,7 @@ class LoadingSurvex():
# this is a python generator idiom.
# see https://realpython.com/introduction-to-python-generators/
- # this is the first use of generators in troggle (Oct.2022)
+ # this is the first use of generators in troggle (Oct.2022) and saves 21 MB of memory
with open(collatefilename, "r") as fcollate:
for svxline in fcollate:
self.lineno += 1
@@ -1349,7 +1422,7 @@ class LoadingSurvex():
return
return
try:
- # python generator idiom again
+ # python generator idiom again. Not important here as these are small files
with open(finname, "r") as fin:
for svxline in fin:
process_line(svxline)
@@ -1475,7 +1548,6 @@ def FindAndLoadSurvex(survexblockroot):
survexfileroot = survexblockroot.survexfile # i.e. SURVEX_TOPNAME only
collatefilename = "_" + survexfileroot.path + ".svx"
-
svx_scan = LoadingSurvex()
svx_scan.callcount = 0
svx_scan.depthinclude = 0
@@ -1502,7 +1574,6 @@ def FindAndLoadSurvex(survexblockroot):
from pstats import SortKey
pr = cProfile.Profile()
pr.enable()
- #print(f"###{survexblockroot=} {survexfileroot.path=}",file=sys.stderr)
#----------------------------------------------------------------
svx_scan.PushdownStackScan(survexblockroot, survexfileroot.path, finrootname, flinear, fcollate)
#----------------------------------------------------------------
@@ -1524,8 +1595,7 @@ def FindAndLoadSurvex(survexblockroot):
mem1 = get_process_memory()
print(" - MEM:{:7.2f} MB END ".format(mem1),file=sys.stderr)
- print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)
-
+ print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)
#
# Process all the omitted files in :loser: with some exceptions
#
@@ -1556,8 +1626,6 @@ def FindAndLoadSurvex(survexblockroot):
for x in removals:
unseens.remove(x)
- # for x in unseens:
- # print(f"'{x}', ", end='', file=sys.stderr)
print(f"\n - {len(unseens)} survex files found which were not included in main tree. ({len(svx_scan.svxfileslist)} in main tree)", file=sys.stderr)
print(f" -- Now loading the previously-omitted survex files.", file=sys.stderr)
@@ -1621,7 +1689,6 @@ def FindAndLoadSurvex(survexblockroot):
print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)
-
# Before doing this, it would be good to identify the *equate and *entrance we need that are relevant to the
# entrance locations currently loaded after this by LoadPos(), but could better be done before ?
# look in MapLocations() for how we find the entrances
@@ -1635,18 +1702,15 @@ def FindAndLoadSurvex(survexblockroot):
#pr2 = cProfile.Profile()
#pr2.enable()
- mem1 = get_process_memory()
- print(f" - MEM:{mem1:7.2f} MB NOT reading '{collatefilename}' into memory.",file=sys.stderr)
print(" ", file=sys.stderr,end='')
#----------------------------------------------------------------
- svx_load.LinearLoad(survexblockroot,survexfileroot.path, collatefilename)
+ svx_load.LinearLoad(survexblockroot, survexfileroot.path, collatefilename)
#----------------------------------------------------------------
#pr2.disable()
# with open('LinearLoad.prof', 'w') as f:
# ps = pstats.Stats(pr2, stream=f)
# ps.sort_stats(SortKey.CUMULATIVE)
# ps.print_stats()
- svxlines = [] # empty 30MB of stashed file
mem1 = get_process_memory()
print("\n - MEM:{:7.2f} MB STOP".format(mem1),file=sys.stderr)
print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)
@@ -1696,21 +1760,33 @@ def MakeOmitFileRoot(fn):
return fileroot
def LoadSurvexBlocks():
+ mem1 = get_process_memory()
+ print(" - MEM:{:7.2f} MB now ".format(mem1),file=sys.stderr)
print(' - Flushing All Survex Blocks...')
+ # why does this increase memory use by 20 MB ?!
+ # We have foreign keys, Django needs to load the related objects
+ # in order to resolve how the relation should handle the deletion:
+ # https://docs.djangoproject.com/en/3.2/ref/models/fields/#django.db.models.ForeignKey.on_delete
SurvexBlock.objects.all().delete()
SurvexFile.objects.all().delete()
SurvexDirectory.objects.all().delete()
SurvexPersonRole.objects.all().delete()
SurvexStation.objects.all().delete()
- print(" - survex Data Issues flushed")
+ mem1 = get_process_memory()
+ print(" - MEM:{:7.2f} MB now. Foreign key objects loaded on deletion. ".format(mem1),file=sys.stderr)
+
+ print(" - Flushing survex Data Issues ")
DataIssue.objects.filter(parser='survex').delete()
DataIssue.objects.filter(parser='svxdate').delete()
DataIssue.objects.filter(parser='survexleg').delete()
DataIssue.objects.filter(parser='survexunits').delete()
DataIssue.objects.filter(parser='entrances').delete()
DataIssue.objects.filter(parser='xEntrances').delete()
-
+ print(" - survex Data Issues flushed")
+ mem1 = get_process_memory()
+ print(" - MEM:{:7.2f} MB now ".format(mem1),file=sys.stderr)
+
survexfileroot = MakeSurvexFileRoot()
# this next makes a block_object assciated with a file_object.path = SURVEX_TOPNAME
survexblockroot = SurvexBlock(name=ROOTBLOCK, survexpath="", cave=None, survexfile=survexfileroot,