summary refs log tree commit diff stats
path: root/parsers/survex.py
diff options
context:
space:
mode:
author Philip Sargent <philip.sargent@gmail.com> 2023-02-28 16:18:29 +0000
committer Philip Sargent <philip.sargent@gmail.com> 2023-02-28 16:18:29 +0000
commitdc03016dbeb136b94d702770e1495c4a8a99a3f6 (patch)
tree4101c0fc0d5cb0420c7e050fea371e8c37f42efe /parsers/survex.py
parent5067ef2c8cb1f9a1c629dcb697b58f3d8347ffcf (diff)
downloadtroggle-dc03016dbeb136b94d702770e1495c4a8a99a3f6.tar.gz
troggle-dc03016dbeb136b94d702770e1495c4a8a99a3f6.tar.bz2
troggle-dc03016dbeb136b94d702770e1495c4a8a99a3f6.zip
clean up de-duplication code
Diffstat (limited to 'parsers/survex.py')
-rw-r--r-- parsers/survex.py | 215
1 file changed, 140 insertions, 75 deletions
diff --git a/parsers/survex.py b/parsers/survex.py
index 3cf3168..5ebf555 100644
--- a/parsers/survex.py
+++ b/parsers/survex.py
@@ -46,8 +46,10 @@ survexomitsroot = None
ROOTBLOCK = "rootblock"
OMITBLOCK = "omitblock"
METRESINFEET = 3.28084
+UNSEENS = "_unseens.svx"
stop_dup_warning = False
+dup_includes = 1
debugprint = False # Turns on debug printout for just one *include file
debugprinttrigger = "!"
@@ -260,8 +262,8 @@ class LoadingSurvex:
rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)")
rx_comment = re.compile(r"([^;]*?)\s*(?:;\s*(.*))?\n?$")
- rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$") # inserted by linear collate ;*include
- rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$") # inserted by linear collate ;*edulcni
+ rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$") # inserted by linear collate ;|*include
+ rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$") # inserted by linear collate ;|*edulcni
rx_include = re.compile(r"(?i)^\s*(\*include[\s].*)$")
rx_include2 = re.compile("(?i)include$")
rx_commref = re.compile(r"(?i)^\s*ref(?:erence)?[\s.:]*(\d+)\s*#\s*(X)?\s*(\d+)")
@@ -300,7 +302,7 @@ class LoadingSurvex:
stacksvxfiles = []
svxfileslist = []
svxdirs = {}
- uniquename = {}
+ uniquefile = {}
expos = {}
survexdict = {} # each key is a directory, and its value is a list of files
lineno = 0
@@ -1163,9 +1165,16 @@ class LoadingSurvex:
"""Creates SurvexFile in the database, and SurvexDirectory if needed
with links to 'cave'
Creates a new current survexfile and valid .survexdirectory
- Inspects the parent folder of the survexfile and uses that to decide if this is a cave we know
+ Inspects the parent folder of the survexfile and uses that to decide if this is
+ a cave we know.
+
+ If we see a duplicate cave, this is too late. It has already been included into the
+ long linear file. This needs to be prevented when the long linear file is created.
+
The survexblock passed-in is not necessarily the parent. FIX THIS.
"""
+ global dup_includes
+
if debugprint:
print(f" # datastack in LoadSurvexFile:{svxid} 'type':", end="")
for dict in self.datastack:
@@ -1173,10 +1182,20 @@ class LoadingSurvex:
print("")
depth = " " * self.depthbegin
- # print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid))
+ print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid))
headpath = os.path.dirname(svxid)
- newfile = SurvexFile(path=svxid)
+ newfile, created = SurvexFile.objects.update_or_create(path=svxid)
+ if not created:
+ dup_includes += 1
+ message = f" ! DUP SurvexFile '{svxid}' create attempt in LoadSurvexFile()"
+ print(message)
+ # print(message, file=sys.stderr)
+ stash_data_issue(parser="survex", message=message, url=f"/survexfile/{svxid}")
+
+ self.currentsurvexfile = newfile
+ return # abort as everything already done for object creation
+
newfile.save() # until we do this there is no internal id so no foreign key works
self.currentsurvexfile = newfile
newdirectory = self.GetSurvexDirectory(headpath)
@@ -1217,7 +1236,11 @@ class LoadingSurvex:
print(f"'{dict['type'].upper()}' ", end="")
print("")
+
def ProcessIncludeLine(self, included):
+ """As we read the long linear file, we come across lines telling us that the
+ content from this point on is from a particular included file
+ """
global debugprint
svxid = included.groups()[0]
if svxid.lower() == debugprinttrigger.lower():
@@ -1226,7 +1249,9 @@ class LoadingSurvex:
self.stacksvxfiles.append(self.currentsurvexfile)
def ProcessEdulcniLine(self, edulcni):
- """Saves the current survexfile in the db"""
+ """As we read the long linear file, we come across lines telling us that the
+ we are about to pop back out of the contents of an included file
+ Saves the current survexfile object in the db to include the data parsed from it"""
global debugprint
svxid = edulcni.groups()[0]
if debugprint:
@@ -1277,8 +1302,8 @@ class LoadingSurvex:
)
included = self.rx_comminc.match(comment)
- # ;*include means 'we have been included'; whereas *include means 'proceed to include'
- # bug, If the original survex file contians the line ;*include then we pick it up ! So fix our special code to be ;|*include
+ # ;|*include means 'we have been included'; whereas *include means 'proceed to include'
+ # No test here to check that this file has not already been included. Ouch.
if included:
self.ProcessIncludeLine(included)
@@ -1553,7 +1578,7 @@ class LoadingSurvex:
self.lineno += 1
sline, comment = self.rx_comment.match(svxline).groups()
if comment:
- # this catches the ;*include NEWFILE and ;*edulcni ENDOFFILE lines too
+ # this catches the ;|*include NEWFILE and ;|*edulcni ENDOFFILE lines too
self.LoadSurvexComment(survexblock, comment)
if not sline:
@@ -1616,40 +1641,40 @@ class LoadingSurvex:
if self.rx_include2.match(cmd):
# rx_include2 = re.compile("(?i)include$")
# if re.match("(?i)include$", cmd):
- includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args)))
-
- fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx")
- self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path)
- self.checkUniqueness(os.path.join(settings.SURVEX_DATA, includepath))
- if os.path.isfile(fullpath):
- # --------------------------------------------------------
- self.depthinclude += 1
- # fininclude = open(fullpath,'r')
- finincludename = fullpath
- fcollate.write(f";|*include {includepath}\n")
- flinear.write(f"{self.depthinclude:2} {indent} *include {includepath}\n")
- push = includepath.lower()
- self.includestack.append(push)
- # -----------------
- self.PushdownStackScan(survexblock, includepath, finincludename, flinear, fcollate)
- # -----------------
- pop = self.includestack.pop()
- if pop != push:
- message = "!! ERROR mismatch *include pop!=push {}".format(pop, push, self.includestack)
+ includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args))) # normalises path syntax
+ if self.never_seen(includepath, path):
+ fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx")
+ self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path)
+ self.check_unique_name(os.path.join(settings.SURVEX_DATA, includepath))
+ if os.path.isfile(fullpath):
+ # --------------------------------------------------------
+ self.depthinclude += 1
+ # fininclude = open(fullpath,'r')
+ finincludename = fullpath
+ fcollate.write(f";|*include {includepath}\n")
+ flinear.write(f"{self.depthinclude:2} {indent} *include {includepath}\n")
+ push = includepath.lower()
+ self.includestack.append(push)
+ # -----------------
+ self.PushdownStackScan(survexblock, includepath, finincludename, flinear, fcollate)
+ # -----------------
+ pop = self.includestack.pop()
+ if pop != push:
+ message = "!! ERROR mismatch *include pop!=push {}".format(pop, push, self.includestack)
+ print(message)
+ print(message, file=flinear)
+ print(message, file=sys.stderr)
+ stash_data_issue(parser="survex", message=message, url=None, sb=(path))
+ flinear.write(f"{self.depthinclude:2} {indent} *edulcni {pop}\n")
+ fcollate.write(f";|*edulcni {pop}\n")
+ # fininclude.close()
+ self.depthinclude -= 1
+ # --------------------------------------------------------
+ else:
+ message = f" ! ERROR *include file '{includepath}' not found, listed in '{fin.name}'"
print(message)
- print(message, file=flinear)
print(message, file=sys.stderr)
stash_data_issue(parser="survex", message=message, url=None, sb=(path))
- flinear.write(f"{self.depthinclude:2} {indent} *edulcni {pop}\n")
- fcollate.write(f";|*edulcni {pop}\n")
- # fininclude.close()
- self.depthinclude -= 1
- # --------------------------------------------------------
- else:
- message = f" ! ERROR *include file '{includepath}' not found, listed in '{fin.name}'"
- print(message)
- print(message, file=sys.stderr)
- stash_data_issue(parser="survex", message=message, url=None, sb=(path))
elif self.rx_begin2.match(cmd):
#elif re.match("(?i)begin$", cmd):
self.depthbegin += 1
@@ -1733,20 +1758,39 @@ class LoadingSurvex:
print(message)
print(message, file=sys.stderr)
stash_data_issue(parser="survex", message=message, url=None, sb=(path))
+ raise
return # skip this survex file and all things *included in it
- def checkUniqueness(self, fullpath):
- fn = Path(fullpath).name
- if fn not in self.uniquename:
- self.uniquename[fn] = [fullpath]
- else:
- self.uniquename[fn].append(fullpath)
- # This is not an error now that we are moving .3d files to the :loser: directory tree
+ def never_seen(self, incpath, parent):
+ """The _unseen files may include survex files we have already seen, and we do not
+ want to process them again. For the _unseens this is not an error, but for the main
+ *include tree it is an error.
+ """
+
+ if incpath in self.uniquefile:
+ self.uniquefile[incpath].append(parent)
+
message = (
- f" NOTE: non-unique survex filename, '{fn}' - '{self.uniquename[fn]}' #{len(self.uniquename[fn])}"
+ f" DUP: non-unique survex filepath, '{incpath}' - #{len(self.uniquefile[incpath])} '{self.uniquefile[incpath]}'"
)
- # print(message)
+ print(message)
# stash_data_issue(parser='survex', message=message)
+ for p in self.uniquefile[incpath]:
+ if p in self.uniquefile:
+ print(f"{p} <- {self.uniquefile[p]}")
+ return False
+ else:
+ self.uniquefile[incpath] = [parent]
+ return True
+
+ def check_unique_name(self, fullpath):
+ """This only checks whether the last bit of the name of the survex file is unique,
+ e.g. "bigpitch", not whether the whole path of the survexfile has been seen before.
+
+ We don't care about this any more.
+ """
+ return
+
def RunSurvexIfNeeded(self, fullpath, calledpath):
now = time.time()
@@ -1843,7 +1887,13 @@ class LoadingSurvex:
def FindAndLoadSurvex(survexblockroot):
- """Follows the *include links successively to find files in the whole include tree"""
+ """Follows the *include links successively to find survex files
+ This proceeds in 3 phases:
+ 1. The root survex file is read and all the *include files are found, using PushdownStackScan()
+ 2. All the other survex files in the :loser: repo are found, and their *includes found,
+ using another PushdownStackScan() [duplicates omitted]
+ 3. The combined expanded file containing all the survex data is parsed as a single file,
+ using LinearLoad()"""
global stop_dup_warning
print(" - redirecting stdout to svxblks.log...")
stdout_orig = sys.stdout
@@ -1861,15 +1911,16 @@ def FindAndLoadSurvex(survexblockroot):
print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr)
svx_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
- svx_scan.checkUniqueness(fullpathtotop)
+ svx_scan.check_unique_name(fullpathtotop)
+ svx_scan.uniquefile[str(survexfileroot)] = ["0"]
indent = ""
fcollate = open(collatefilename, "w")
mem0 = get_process_memory()
- print(f" - MEM:{mem0:7.2f} MB START", file=sys.stderr)
+ print(f" - MEM:{mem0:7.2f} MB START '{survexfileroot}'", file=sys.stderr)
flinear = open("svxlinear.log", "w")
- flinear.write(f" - MEM:{mem0:7.2f} MB START {survexfileroot.path}\n")
+ flinear.write(f" - MEM:{mem0:7.2f} MB START '{survexfileroot.path}'\n")
print(" ", file=sys.stderr, end="")
finrootname = Path(settings.SURVEX_DATA, survexfileroot.path + ".svx")
@@ -1897,16 +1948,24 @@ def FindAndLoadSurvex(survexblockroot):
flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {survexfileroot.path}\n")
flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED\n")
flinear.write(f" - {len(svx_scan.svxfileslist):,} survex files in linear include list \n")
-
+ flinear.write(f" - {len(svx_scan.uniquefile):,} unique survex files in linear include list \n")
+ for j in svx_scan.svxfileslist:
+ if j not in svx_scan.uniquefile:
+ flinear.write(f" - '{j}' {type(j)} not in unique list \n")
+ for f in svx_scan.uniquefile:
+ # flinear.write(f" - '{f}' {type(f)} {svx_scan.uniquefile[f]} \n")
+ if len(svx_scan.uniquefile[f]) > 1:
+ flinear.write(f" - '{f}' {type(f)} {svx_scan.uniquefile[f]} dup survex files \n")
+
print(f"\n - {svx_scan.caverncount:,} runs of survex 'cavern' refreshing .3d files", file=sys.stderr)
print(f" - {len(svx_scan.svxfileslist):,} survex files from tree in linear include list", file=sys.stderr)
-
+ print(f" - {len(svx_scan.uniquefile):,} unique survex files from tree in linear include list", file=sys.stderr)
mem1 = get_process_memory()
print(f" - MEM:{mem1:7.2f} MB END ", file=sys.stderr)
print(f" - MEM:{mem1 - mem0:7.3f} MB ADDITIONALLY USED", file=sys.stderr)
+
#
# Process all the omitted files in :loser: with some exceptions
- #
unseens = set()
b = []
@@ -1926,13 +1985,14 @@ def FindAndLoadSurvex(survexblockroot):
file=sys.stderr,
)
- excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", "_unseens"]
+ unseensroot = re.sub(r"\.svx$", "", UNSEENS)
+ excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", unseensroot]
removals = []
for x in unseens:
for o in excpts:
if str(x).strip().startswith(o):
removals.append(x)
- # special fix for file not actually in survex format
+ # special fix for .svx file not actually in survex format
unseens.remove(Path("fixedpts/gps/gps00raw"))
for x in removals:
@@ -1944,7 +2004,7 @@ def FindAndLoadSurvex(survexblockroot):
check_team_cache()
print(" -- Now loading the previously-omitted survex files.", file=sys.stderr)
- with open(Path(settings.SURVEX_DATA, "_unseens.svx"), "w") as u:
+ with open(Path(settings.SURVEX_DATA, UNSEENS), "w") as u:
u.write(
f"; {len(unseens):,} survex files not *included by {settings.SURVEX_TOPNAME} (which are {len(svx_scan.svxfileslist):,} files)\n"
)
@@ -1960,7 +2020,7 @@ def FindAndLoadSurvex(survexblockroot):
omit_scan = LoadingSurvex()
omit_scan.callcount = 0
omit_scan.depthinclude = 0
- fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, "_unseens.svx")
+ fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, UNSEENS)
# copy the list to prime the next pass through the files
omit_scan.svxfileslist = svx_scan.svxfileslist[:]
@@ -1969,32 +2029,35 @@ def FindAndLoadSurvex(survexblockroot):
print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr)
omit_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
- omit_scan.checkUniqueness(fullpathtotop)
+ omit_scan.check_unique_name(fullpathtotop)
+ omit_scan.uniquefile[unseensroot] = ["0"]
mem0 = get_process_memory()
- print(f" - MEM:{mem0:7.2f} MB START '_unseens'", file=sys.stderr)
+ print(f" - MEM:{mem0:7.2f} MB START '{unseensroot}'", file=sys.stderr)
# flinear = open('svxlinear.log', 'w')
- flinear.write(f" - MEM:{mem0:7.2f} MB START '_unseens'\n")
+ flinear.write(f" - MEM:{mem0:7.2f} MB START '{unseensroot}'\n")
print(" ", file=sys.stderr, end="")
+ # this is a bit tricky as some unseen files will *include files we have already seen, which
+ # we should not process again.
finrootname = fullpathtotop
- fcollate.write(";*include _unseens.svx\n")
- flinear.write(f"{omit_scan.depthinclude:2} {indent} *include _unseens\n")
- stop_dup_warning = True
+ fcollate.write(f";*include {UNSEENS}\n")
+ flinear.write(f"{omit_scan.depthinclude:2} {indent} *include {unseensroot}\n")
+ # stop_dup_warning = True
# ----------------------------------------------------------------
- omit_scan.PushdownStackScan(survexblockroot, "_unseens", finrootname, flinear, fcollate)
+ omit_scan.PushdownStackScan(survexblockroot, unseensroot, finrootname, flinear, fcollate)
# ----------------------------------------------------------------
- stop_dup_warning = False
+ # stop_dup_warning = False
- flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni _unseens\n")
- fcollate.write(";*edulcni _unseens.svx\n")
+ flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni {unseensroot}\n")
+ fcollate.write(f";*edulcni {UNSEENS}\n")
check_team_cache()
mem1 = get_process_memory()
- flinear.write(f"\n - MEM:{mem1:.2f} MB STOP _unseens.svx OMIT\n")
- flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED OMIT\n")
- flinear.write(f" - {len(omit_scan.svxfileslist):,} survex files in linear include list OMIT \n")
+ flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {UNSEENS} Unseen Oddments\n")
+ flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED Unseen Oddments\n")
+ flinear.write(f" - {len(omit_scan.svxfileslist):,} survex files in linear include list Unseen Oddments \n")
flinear.close()
fcollate.close()
@@ -2085,6 +2148,7 @@ def MakeOmitFileRoot(fn):
def LoadSurvexBlocks():
+ global dup_includes
mem1 = get_process_memory()
print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
start = time.time()
@@ -2129,7 +2193,7 @@ def LoadSurvexBlocks():
# sudo service mariadb start
survexblockroot.save()
- omitsfileroot = MakeOmitFileRoot("_unseens.svx")
+ omitsfileroot = MakeOmitFileRoot(UNSEENS)
survexomitsroot = SurvexBlock(
name=OMITBLOCK, survexpath="", survexfile=omitsfileroot, legsall=0, legslength=0.0
)
@@ -2157,5 +2221,6 @@ def LoadSurvexBlocks():
store_data_issues()
# duration = time.time() - start
# print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
+ print(f" - Duplicate *includes = {dup_includes}")
print(" - Loaded All Survex Blocks.")