summary refs log tree commit diff stats
path: root/parsers/survex.py
diff options
context:
space:
mode:
author Philip Sargent <philip.sargent@gmail.com> 2023-02-28 16:18:29 +0000
committer Philip Sargent <philip.sargent@gmail.com> 2023-02-28 16:18:29 +0000
commitdc03016dbeb136b94d702770e1495c4a8a99a3f6 (patch)
tree4101c0fc0d5cb0420c7e050fea371e8c37f42efe /parsers/survex.py
parent5067ef2c8cb1f9a1c629dcb697b58f3d8347ffcf (diff)
downloadtroggle-dc03016dbeb136b94d702770e1495c4a8a99a3f6.tar.gz
troggle-dc03016dbeb136b94d702770e1495c4a8a99a3f6.tar.bz2
troggle-dc03016dbeb136b94d702770e1495c4a8a99a3f6.zip
clean up de-duplication code
Diffstat (limited to 'parsers/survex.py')
-rw-r--r-- parsers/survex.py | 215
1 file changed, 140 insertions, 75 deletions
diff --git a/parsers/survex.py b/parsers/survex.py
index 3cf3168..5ebf555 100644
--- a/parsers/survex.py
+++ b/parsers/survex.py
@@ -46,8 +46,10 @@ survexomitsroot = None
ROOTBLOCK = "rootblock"
OMITBLOCK = "omitblock"
METRESINFEET = 3.28084
+UNSEENS = "_unseens.svx"
stop_dup_warning = False
+dup_includes = 1
debugprint = False # Turns on debug printout for just one *include file
debugprinttrigger = "!"
@@ -260,8 +262,8 @@ class LoadingSurvex:
rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)")
rx_comment = re.compile(r"([^;]*?)\s*(?:;\s*(.*))?\n?$")
- rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$") # inserted by linear collate ;*include
- rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$") # inserted by linear collate ;*edulcni
+ rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$") # inserted by linear collate ;|*include
+ rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$") # inserted by linear collate ;|*edulcni
rx_include = re.compile(r"(?i)^\s*(\*include[\s].*)$")
rx_include2 = re.compile("(?i)include$")
rx_commref = re.compile(r"(?i)^\s*ref(?:erence)?[\s.:]*(\d+)\s*#\s*(X)?\s*(\d+)")
@@ -300,7 +302,7 @@ class LoadingSurvex:
stacksvxfiles = []
svxfileslist = []
svxdirs = {}
- uniquename = {}
+ uniquefile = {}
expos = {}
survexdict = {} # each key is a directory, and its value is a list of files
lineno = 0
@@ -1163,9 +1165,16 @@ class LoadingSurvex:
"""Creates SurvexFile in the database, and SurvexDirectory if needed
with links to 'cave'
Creates a new current survexfile and valid .survexdirectory
- Inspects the parent folder of the survexfile and uses that to decide if this is a cave we know
+ Inspects the parent folder of the survexfile and uses that to decide if this is
+ a cave we know.
+
+ If we see a duplicate cave, this is too late. It has already been included into the
+ long linear file. This needs to be prevented when the long linear file is created.
+
The survexblock passed-in is not necessarily the parent. FIX THIS.
"""
+ global dup_includes
+
if debugprint:
print(f" # datastack in LoadSurvexFile:{svxid} 'type':", end="")
for dict in self.datastack:
@@ -1173,10 +1182,20 @@ class LoadingSurvex:
print("")
depth = " " * self.depthbegin
- # print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid))
+ print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid))
headpath = os.path.dirname(svxid)
- newfile = SurvexFile(path=svxid)
+ newfile, created = SurvexFile.objects.update_or_create(path=svxid)
+ if not created:
+ dup_includes += 1
+ message = f" ! DUP SurvexFile '{svxid}' create attempt in LoadSurvexFile()"
+ print(message)
+ # print(message, file=sys.stderr)
+ stash_data_issue(parser="survex", message=message, url=f"/survexfile/{svxid}")
+
+ self.currentsurvexfile = newfile
+ return # abort as everything already done for object creation
+
newfile.save() # until we do this there is no internal id so no foreign key works
self.currentsurvexfile = newfile
newdirectory = self.GetSurvexDirectory(headpath)
@@ -1217,7 +1236,11 @@ class LoadingSurvex:
print(f"'{dict['type'].upper()}' ", end="")
print("")
+
def ProcessIncludeLine(self, included):
+ """As we read the long linear file, we come across lines telling us that the
+ content from this point on is from a particular included file
+ """
global debugprint
svxid = included.groups()[0]
if svxid.lower() == debugprinttrigger.lower():
@@ -1226,7 +1249,9 @@ class LoadingSurvex:
self.stacksvxfiles.append(self.currentsurvexfile)
def ProcessEdulcniLine(self, edulcni):
- """Saves the current survexfile in the db"""
+ """As we read the long linear file, we come across lines telling us that the
+ we are about to pop back out of the contents of an included file
+ Saves the current survexfile object in the db to include the data parsed from it"""
global debugprint
svxid = edulcni.groups()[0]
if debugprint:
@@ -1277,8 +1302,8 @@ class LoadingSurvex:
)
included = self.rx_comminc.match(comment)
- # ;*include means 'we have been included'; whereas *include means 'proceed to include'
- # bug, If the original survex file contians the line ;*include then we pick it up ! So fix our special code to be ;|*include
+ # ;|*include means 'we have been included'; whereas *include means 'proceed to include'
+ # No test here to check that this file has not already been included. Ouch.
if included:
self.ProcessIncludeLine(included)
@@ -1553,7 +1578,7 @@ class LoadingSurvex:
self.lineno += 1
sline, comment = self.rx_comment.match(svxline).groups()
if comment:
- # this catches the ;*include NEWFILE and ;*edulcni ENDOFFILE lines too
+ # this catches the ;|*include NEWFILE and ;|*edulcni ENDOFFILE lines too
self.LoadSurvexComment(survexblock, comment)
if not sline:
@@ -1616,40 +1641,40 @@ class LoadingSurvex:
if self.rx_include2.match(cmd):
# rx_include2 = re.compile("(?i)include$")
# if re.match("(?i)include$", cmd):
- includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args)))
-
- fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx")
- self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path)
- self.checkUniqueness(os.path.join(settings.SURVEX_DATA, includepath))
- if os.path.isfile(fullpath):
- # --------------------------------------------------------
- self.depthinclude += 1
- # fininclude = open(fullpath,'r')
- finincludename = fullpath
- fcollate.write(f";|*include {includepath}\n")
- flinear.write(f"{self.depthinclude:2} {indent} *include {includepath}\n")
- push = includepath.lower()
- self.includestack.append(push)
- # -----------------
- self.PushdownStackScan(survexblock, includepath, finincludename, flinear, fcollate)
- # -----------------
- pop = self.includestack.pop()
- if pop != push:
- message = "!! ERROR mismatch *include pop!=push {}".format(pop, push, self.includestack)
+ includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args))) # normalises path syntax
+ if self.never_seen(includepath, path):
+ fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx")
+ self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path)
+ self.check_unique_name(os.path.join(settings.SURVEX_DATA, includepath))
+ if os.path.isfile(fullpath):
+ # --------------------------------------------------------
+ self.depthinclude += 1
+ # fininclude = open(fullpath,'r')
+ finincludename = fullpath
+ fcollate.write(f";|*include {includepath}\n")
+ flinear.write(f"{self.depthinclude:2} {indent} *include {includepath}\n")
+ push = includepath.lower()
+ self.includestack.append(push)
+ # -----------------
+ self.PushdownStackScan(survexblock, includepath, finincludename, flinear, fcollate)
+ # -----------------
+ pop = self.includestack.pop()
+ if pop != push:
+ message = "!! ERROR mismatch *include pop!=push {}".format(pop, push, self.includestack)
+ print(message)
+ print(message, file=flinear)
+ print(message, file=sys.stderr)
+ stash_data_issue(parser="survex", message=message, url=None, sb=(path))
+ flinear.write(f"{self.depthinclude:2} {indent} *edulcni {pop}\n")
+ fcollate.write(f";|*edulcni {pop}\n")
+ # fininclude.close()
+ self.depthinclude -= 1
+ # --------------------------------------------------------
+ else:
+ message = f" ! ERROR *include file '{includepath}' not found, listed in '{fin.name}'"
print(message)
- print(message, file=flinear)
print(message, file=sys.stderr)
stash_data_issue(parser="survex", message=message, url=None, sb=(path))
- flinear.write(f"{self.depthinclude:2} {indent} *edulcni {pop}\n")
- fcollate.write(f";|*edulcni {pop}\n")
- # fininclude.close()
- self.depthinclude -= 1
- # --------------------------------------------------------
- else:
- message = f" ! ERROR *include file '{includepath}' not found, listed in '{fin.name}'"
- print(message)
- print(message, file=sys.stderr)
- stash_data_issue(parser="survex", message=message, url=None, sb=(path))
elif self.rx_begin2.match(cmd):
#elif re.match("(?i)begin$", cmd):
self.depthbegin += 1
@@ -1733,20 +1758,39 @@ class LoadingSurvex:
print(message)
print(message, file=sys.stderr)
stash_data_issue(parser="survex", message=message, url=None, sb=(path))
+ raise
return # skip this survex file and all things *included in it
- def checkUniqueness(self, fullpath):
- fn = Path(fullpath).name
- if fn not in self.uniquename:
- self.uniquename[fn] = [fullpath]
- else:
- self.uniquename[fn].append(fullpath)
- # This is not an error now that we are moving .3d files to the :loser: directory tree
+ def never_seen(self, incpath, parent):
+ """The _unseen files may include survex files we have already seen, and we do not
+ want to process them again. For the _unseens this is not an error, but for the main
+ *include tree it is an error.
+ """
+
+ if incpath in self.uniquefile:
+ self.uniquefile[incpath].append(parent)
+
message = (
- f" NOTE: non-unique survex filename, '{fn}' - '{self.uniquename[fn]}' #{len(self.uniquename[fn])}"
+ f" DUP: non-unique survex filepath, '{incpath}' - #{len(self.uniquefile[incpath])} '{self.uniquefile[incpath]}'"
)
- # print(message)
+ print(message)
# stash_data_issue(parser='survex', message=message)
+ for p in self.uniquefile[incpath]:
+ if p in self.uniquefile:
+ print(f"{p} <- {self.uniquefile[p]}")
+ return False
+ else:
+ self.uniquefile[incpath] = [parent]
+ return True
+
+ def check_unique_name(self, fullpath):
+ """This only checks whether the last bit of the name of the survex file is unique,
+ e.g. "bigpitch", not whether the whole path of the survexfile has been seen before.
+
+ We don't care about this any more.
+ """
+ return
+
def RunSurvexIfNeeded(self, fullpath, calledpath):
now = time.time()
@@ -1843,7 +1887,13 @@ class LoadingSurvex:
def FindAndLoadSurvex(survexblockroot):
- """Follows the *include links successively to find files in the whole include tree"""
+ """Follows the *include links successively to find survex files
+ This proceeds in 3 phases:
+ 1. The root survex file is read and all the *include files are found, using PushdownStackScan()
+ 2. All the other survex files in the :loser: repo are found, and their *includes found,
+ using another PushdownStackScan() [duplicates omitted]
+ 3. The combined expanded file containing all the survex data is parsed as a single file,
+ using LinearLoad()"""
global stop_dup_warning
print(" - redirecting stdout to svxblks.log...")
stdout_orig = sys.stdout
@@ -1861,15 +1911,16 @@ def FindAndLoadSurvex(survexblockroot):
print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr)
svx_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
- svx_scan.checkUniqueness(fullpathtotop)
+ svx_scan.check_unique_name(fullpathtotop)
+ svx_scan.uniquefile[str(survexfileroot)] = ["0"]
indent = ""
fcollate = open(collatefilename, "w")
mem0 = get_process_memory()
- print(f" - MEM:{mem0:7.2f} MB START", file=sys.stderr)
+ print(f" - MEM:{mem0:7.2f} MB START '{survexfileroot}'", file=sys.stderr)
flinear = open("svxlinear.log", "w")
- flinear.write(f" - MEM:{mem0:7.2f} MB START {survexfileroot.path}\n")
+ flinear.write(f" - MEM:{mem0:7.2f} MB START '{survexfileroot.path}'\n")
print(" ", file=sys.stderr, end="")
finrootname = Path(settings.SURVEX_DATA, survexfileroot.path + ".svx")
@@ -1897,16 +1948,24 @@ def FindAndLoadSurvex(survexblockroot):
flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {survexfileroot.path}\n")
flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED\n")
flinear.write(f" - {len(svx_scan.svxfileslist):,} survex files in linear include list \n")
-
+ flinear.write(f" - {len(svx_scan.uniquefile):,} unique survex files in linear include list \n")
+ for j in svx_scan.svxfileslist:
+ if j not in svx_scan.uniquefile:
+ flinear.write(f" - '{j}' {type(j)} not in unique list \n")
+ for f in svx_scan.uniquefile:
+ # flinear.write(f" - '{f}' {type(f)} {svx_scan.uniquefile[f]} \n")
+ if len(svx_scan.uniquefile[f]) > 1:
+ flinear.write(f" - '{f}' {type(f)} {svx_scan.uniquefile[f]} dup survex files \n")
+
print(f"\n - {svx_scan.caverncount:,} runs of survex 'cavern' refreshing .3d files", file=sys.stderr)
print(f" - {len(svx_scan.svxfileslist):,} survex files from tree in linear include list", file=sys.stderr)
-
+ print(f" - {len(svx_scan.uniquefile):,} unique survex files from tree in linear include list", file=sys.stderr)
mem1 = get_process_memory()
print(f" - MEM:{mem1:7.2f} MB END ", file=sys.stderr)
print(f" - MEM:{mem1 - mem0:7.3f} MB ADDITIONALLY USED", file=sys.stderr)
+
#
# Process all the omitted files in :loser: with some exceptions
- #
unseens = set()
b = []
@@ -1926,13 +1985,14 @@ def FindAndLoadSurvex(survexblockroot):
file=sys.stderr,
)
- excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", "_unseens"]
+ unseensroot = re.sub(r"\.svx$", "", UNSEENS)
+ excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", unseensroot]
removals = []
for x in unseens:
for o in excpts:
if str(x).strip().startswith(o):
removals.append(x)
- # special fix for file not actually in survex format
+ # special fix for .svx file not actually in survex format
unseens.remove(Path("fixedpts/gps/gps00raw"))
for x in removals:
@@ -1944,7 +2004,7 @@ def FindAndLoadSurvex(survexblockroot):
check_team_cache()
print(" -- Now loading the previously-omitted survex files.", file=sys.stderr)
- with open(Path(settings.SURVEX_DATA, "_unseens.svx"), "w") as u:
+ with open(Path(settings.SURVEX_DATA, UNSEENS), "w") as u:
u.write(
f"; {len(unseens):,} survex files not *included by {settings.SURVEX_TOPNAME} (which are {len(svx_scan.svxfileslist):,} files)\n"
)
@@ -1960,7 +2020,7 @@ def FindAndLoadSurvex(survexblockroot):
omit_scan = LoadingSurvex()
omit_scan.callcount = 0
omit_scan.depthinclude = 0
- fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, "_unseens.svx")
+ fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, UNSEENS)
# copy the list to prime the next pass through the files
omit_scan.svxfileslist = svx_scan.svxfileslist[:]
@@ -1969,32 +2029,35 @@ def FindAndLoadSurvex(survexblockroot):
print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr)
omit_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
- omit_scan.checkUniqueness(fullpathtotop)
+ omit_scan.check_unique_name(fullpathtotop)
+ omit_scan.uniquefile[unseensroot] = ["0"]
mem0 = get_process_memory()
- print(f" - MEM:{mem0:7.2f} MB START '_unseens'", file=sys.stderr)
+ print(f" - MEM:{mem0:7.2f} MB START '{unseensroot}'", file=sys.stderr)
# flinear = open('svxlinear.log', 'w')
- flinear.write(f" - MEM:{mem0:7.2f} MB START '_unseens'\n")
+ flinear.write(f" - MEM:{mem0:7.2f} MB START '{unseensroot}'\n")
print(" ", file=sys.stderr, end="")
+ # this is a bit tricky as some unseen files will *include files we have already seen, which
+ # we should not process again.
finrootname = fullpathtotop
- fcollate.write(";*include _unseens.svx\n")
- flinear.write(f"{omit_scan.depthinclude:2} {indent} *include _unseens\n")
- stop_dup_warning = True
+ fcollate.write(f";*include {UNSEENS}\n")
+ flinear.write(f"{omit_scan.depthinclude:2} {indent} *include {unseensroot}\n")
+ # stop_dup_warning = True
# ----------------------------------------------------------------
- omit_scan.PushdownStackScan(survexblockroot, "_unseens", finrootname, flinear, fcollate)
+ omit_scan.PushdownStackScan(survexblockroot, unseensroot, finrootname, flinear, fcollate)
# ----------------------------------------------------------------
- stop_dup_warning = False
+ # stop_dup_warning = False
- flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni _unseens\n")
- fcollate.write(";*edulcni _unseens.svx\n")
+ flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni {unseensroot}\n")
+ fcollate.write(f";*edulcni {UNSEENS}\n")
check_team_cache()
mem1 = get_process_memory()
- flinear.write(f"\n - MEM:{mem1:.2f} MB STOP _unseens.svx OMIT\n")
- flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED OMIT\n")
- flinear.write(f" - {len(omit_scan.svxfileslist):,} survex files in linear include list OMIT \n")
+ flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {UNSEENS} Unseen Oddments\n")
+ flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED Unseen Oddments\n")
+ flinear.write(f" - {len(omit_scan.svxfileslist):,} survex files in linear include list Unseen Oddments \n")
flinear.close()
fcollate.close()
@@ -2085,6 +2148,7 @@ def MakeOmitFileRoot(fn):
def LoadSurvexBlocks():
+ global dup_includes
mem1 = get_process_memory()
print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
start = time.time()
@@ -2129,7 +2193,7 @@ def LoadSurvexBlocks():
# sudo service mariadb start
survexblockroot.save()
- omitsfileroot = MakeOmitFileRoot("_unseens.svx")
+ omitsfileroot = MakeOmitFileRoot(UNSEENS)
survexomitsroot = SurvexBlock(
name=OMITBLOCK, survexpath="", survexfile=omitsfileroot, legsall=0, legslength=0.0
)
@@ -2157,5 +2221,6 @@ def LoadSurvexBlocks():
store_data_issues()
# duration = time.time() - start
# print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
+ print(f" - Duplicate *includes = {dup_includes}")
print(" - Loaded All Survex Blocks.")