1 files changed, 183 insertions, 242 deletions
diff --git a/parsers/caves.py b/parsers/caves.py
index 7eba28a..5389a8e 100644
--- a/parsers/caves.py
+++ b/parsers/caves.py
@@ -27,6 +27,8 @@ todo = """
 - Cannot use Edit This Page for pendingcaves.txt_edit as Edit This Page is expecting an html file.
    So we will need a separate file-editing capability just for this configuration file ?!
    
+- we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
+
 - Semi-automagically import all the 1627- pending caves and create HTML files for them to be
   edited individually. (These are caves we only know about because we have German survex files.)
    
@@ -281,6 +283,9 @@ def getXML(text, itemname, minItems=1, maxItems=None, context=""):
     """Reads a single XML tag
     Should throw exception rather than producing error message here,
     then handle exception in calling routine where it has the context.
+    
+    This always succeeds, but if the number of items found is out of range it prints an
+    error message on the terminal and records it in the DataIssue log.
     """
     items = re.findall("<%(itemname)s>(.*?)</%(itemname)s>" % {"itemname": itemname}, text, re.S)
     if len(items) < minItems:
@@ -300,7 +305,7 @@ def getXML(text, itemname, minItems=1, maxItems=None, context=""):
             + " in file "
             + context
         )
-        DataIssue.objects.create(parser="caves", message=message)
+        DataIssue.objects.create(parser="caves", message=message, url="" + context)
         print(message)
     if minItems == 0:
         if not items:
@@ -315,12 +320,19 @@ def boolify(boolstrs):
             "true": True,
             "false": False}[boolstrs[0]]
 
-def readentrance(filename, ent=None):
-    """Reads an entrance description from the .html file
-    
-    If not called as part of initial import, then the global lists will not be correct
-    but this is OK, a search will find them in the db.
-    """
+def read_entrance(filename, ent=None):
+    """Reads an entrance description from the .html file.
+
+      If not called as part of initial import, then the global lists will not be correct
+      but this is OK, a search will find them in the db.
+
+      Args:
+        filename: The name of the .html file.
+        ent: The entrance object, if it already exists.
+
+      Returns:
+        The entrance object (newly created if `ent` is None), or None if the file does not hold exactly one entrance.
+      """
     def getXMLmax1(field):
         return getXML(entrancecontents, field, maxItems=1, context=context)
         
@@ -333,23 +345,22 @@ def readentrance(filename, ent=None):
         contents = f.read()
     context = filename
     
-    # print("Reading file ENTRANCE {} / {}".format(ENTRANCEDESCRIPTIONS, filename))
     entrancecontentslist = getXML(contents, "entrance", maxItems=1, context=context)
     if len(entrancecontentslist) != 1:
-        message = f'! BAD ENTRANCE at "{filename}". Loading aborted. '
-        DataIssue.objects.create(parser="entrances", message=message)
+        message = f'! BAD ENTRANCE DATA in "{filename}". More than one entrance. Edit file manually, click.'
+        DataIssue.objects.create(parser="entrances", message=message, url=f"/entrance_data/{filename}_edit")
         print(message)
-        return
+        return None
 
     entrancecontents = entrancecontentslist[0]
     slugs = getXML(entrancecontents, "slug", context=context)
+    slug = slugs[0]
 
     if len(slugs) >1:
         # Only ever one of these per entrance in the expo dataset
-        message = f" ! - More than one slug for an entrance: {entrance}, slugs: {slugs}. Aborting."
+        message = f" ! - More than one slug for an entrance: {entrance}, slugs: {slugs}. Ignoring all except first."
         DataIssue.objects.create(parser="entrances", message=message, url=f"/cave/{slug}/edit/")
         print(message)
-        return
         
     lastvisit = getXML(entrancecontents, "last visit date", maxItems=1, minItems=0, context=context)
 
@@ -376,64 +387,39 @@ def readentrance(filename, ent=None):
     underground_description = getXMLmax1("underground_description")
     url = getXMLmax1("url")
 
-    if ent:
-        ent.name=name[0]
-        ent.non_public=boolify(non_public)
-        ent.alt=alt[0]
-        ent.approach=approach[0]
-        ent.bearings=bearings[0]
-        ent.easting=easting[0]
-        ent.entrance_description=entrance_description[0]
-        ent.exact_station=exact_station[0]
-        ent.explorers=explorers[0]
-        ent.filename=filename
-        ent.findability=findability[0]
-        ent.findability_description=findability_description[0]
-        ent.lastvisit=lastvisit[0]
-        ent.location_description=location_description[0]
-        ent.map_description=map_description[0]
-        ent.marking=marking[0]
-        ent.marking_comment=marking_comment[0]
-        ent.northing=northing[0]
-        ent.other_description=other_description[0]
-        ent.other_station=other_station[0]
-        ent.photo=photo[0]
-        ent.slug=slugs[0]
-        ent.tag_station=tag_station[0]
-        ent.underground_description=underground_description[0]
-        ent.url=url[0]
-        ent.save()
-    else:
-        e, state = Entrance.objects.update_or_create(
-            name=name[0],
-            non_public=boolify(non_public),
-            alt=alt[0],
-            approach=approach[0],
-            bearings=bearings[0],
-            easting=easting[0],
-            entrance_description=entrance_description[0],
-            exact_station=exact_station[0],
-            explorers=explorers[0],
-            filename=filename,
-            findability=findability[0],
-            findability_description=findability_description[0],
-            lastvisit=lastvisit[0],
-            location_description=location_description[0],
-            map_description=map_description[0],
-            marking=marking[0],
-            marking_comment=marking_comment[0],
-            northing=northing[0],
-            other_description=other_description[0],
-            other_station=other_station[0],
-            photo=photo[0],
-            slug=slugs[0],
-            tag_station=tag_station[0],
-            underground_description=underground_description[0],
-            url=url[0],
-        )
-        e.save()
+    if not ent:
+        ent, state = Entrance.objects.update_or_create(slug=slug)
+        
+    ent.name=name[0]
+    ent.non_public=boolify(non_public)
+    ent.alt=alt[0]
+    ent.approach=approach[0]
+    ent.bearings=bearings[0]
+    ent.easting=easting[0]
+    ent.entrance_description=entrance_description[0]
+    ent.exact_station=exact_station[0]
+    ent.explorers=explorers[0]
+    ent.filename=filename
+    ent.findability=findability[0]
+    ent.findability_description=findability_description[0]
+    ent.lastvisit=lastvisit[0]
+    ent.location_description=location_description[0]
+    ent.map_description=map_description[0]
+    ent.marking=marking[0]
+    ent.marking_comment=marking_comment[0]
+    ent.northing=northing[0]
+    ent.other_description=other_description[0]
+    ent.other_station=other_station[0]
+    ent.photo=photo[0]
+    # ent.slug=slugs[0]
+    ent.tag_station=tag_station[0]
+    ent.underground_description=underground_description[0]
+    ent.url=url[0]
+    
+    ent.save()
+    return ent
 
-def readcave(filename, cave=None):
+def read_cave(filename, cave=None):
     """Reads an entrance description from the .html file
     Convoluted. Sorry. Needs rewriting
     Assumes any area it hasn't seen before is a subarea of 1623
@@ -441,9 +427,13 @@ def readcave(filename, cave=None):
     If not called as part of initial import, then the global lists will not be correct
     but this is OK, a search will find them in the db.
     """
+    def getXMLmax1(field):
+        return getXML(cavecontents, field, maxItems=1, context=context)
+        
     def do_entrances():
         """For both bulk import and individual re-reading of cave_data file,
         fix the entrances
+        What is Class CaveAndEntrance for?
         """
         for e in entrances:
             eslug = getXML(e, "entranceslug", maxItems=1, context=context)[0]
@@ -458,11 +448,11 @@ def readcave(filename, cave=None):
                         entrance = Entrance.objects.get(slug=eslug)
                         entrances_xslug[eslug] = entrance
                     CaveAndEntrance.objects.update_or_create(
-                        cave=c, entrance_letter=letter, entrance=entrance
+                        cave=cave, entrance_letter=letter, entrance=entrance
                     )
                 except:
-                    message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{c}" filename:"cave_data/{filename}"'
-                    DataIssue.objects.create(parser="entrances", message=message, url=f"{c.url}_edit/")
+                    message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{cave}" filename:"cave_data/{filename}"'
+                    DataIssue.objects.create(parser="entrances", message=message, url=f"{cave.url}_edit/")
                     print(message)        
     def reload_entrances():
         """For individual re-reading of a cave_data file when editing,
@@ -470,155 +460,92 @@ def readcave(filename, cave=None):
         """
         for eslug in entrances_xslug:
             entrance = entrances_xslug[eslug]
-            readentrance(entrance.filename, ent=entrance)
+            read_entrance(entrance.filename, ent=entrance)
             entrance.save()
-                    
+
+    def do_caveslugstuff():
+        """This may be a fossil. We only have one slug per cave in troggle.
+        Pending destruction of this whole concept and Class CaveSlug
+        What is Class CaveSlug for?
+        """
+        primary = True # this sets the first thing we find to be primary=True and all the others =False
+        for slug in slugs:
+            if slug in caves_xslug:
+                cs = caves_xslug[slug]
+            else:
+               try:  
+                    cs = CaveSlug.objects.update_or_create(cave=cave, slug=slug, primary=primary)
+                    caves_xslug[slug] = cs
+               except Exception as ex:
+                    #raise
+                    # This fails to do an update! It just crashes.. to be fixed
+                    message = f" ! CaveSlug update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}"
+                    DataIssue.objects.create(parser="caves", message=message, url=f"{cave.url}_edit/")
+                    print(message)
+            primary = False
+            
     global entrances_xslug
     global caves_xslug
     global areas_xslug
 
+    # Note: we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
+    
     # Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
     fn = settings.CAVEDESCRIPTIONS / filename
     # print(f" - Reading Cave from cave descriptions file {fn}")
     if not fn.exists():
         message = f" ! Cave_data file reading problem filename:'cave_data/{filename}'"
-        DataIssue.objects.create(parser="caves", message=message, url=None)
+        DataIssue.objects.create(parser="caves", message=message, url=f"/cave_data/{filename}_edit")
         print(message)
-        return
+        return None
 
     with open(fn) as f:
         contents = f.read()
     context = filename
     cavecontentslist = getXML(contents, "cave", maxItems=1, context=context)
+
     if len(cavecontentslist) != 1:
-        message = f'! BAD CAVE at "{filename}"'
-        DataIssue.objects.create(parser="caves", message=message)
+        message = f'! BAD CAVE DATA in "{filename}". More than one cave. Edit file manually, click.'
+        DataIssue.objects.create(parser="caves", message=message, url=f"/cave_data/{filename}_edit")
         print(message)
-        return
+        return None
         
     cavecontents = cavecontentslist[0]
-    non_public = getXML(cavecontents, "non_public", maxItems=1, context=context)
     slugs = getXML(cavecontents, "caveslug", maxItems=1, context=context)
-    official_name = getXML(cavecontents, "official_name", maxItems=1, context=context)
-    areas = getXML(cavecontents, "area", context=context)
-    kataster_code = getXML(cavecontents, "kataster_code", maxItems=1, context=context)
-    kataster_number = getXML(cavecontents, "kataster_number", maxItems=1, context=context)
-    unofficial_number = getXML(cavecontents, "unofficial_number", maxItems=1, context=context)
-    explorers = getXML(cavecontents, "explorers", maxItems=1, context=context)
-    underground_description = getXML(cavecontents, "underground_description", maxItems=1, context=context)
-    equipment = getXML(cavecontents, "equipment", maxItems=1, context=context)
-    references = getXML(cavecontents, "references", maxItems=1, context=context)
-    survey = getXML(cavecontents, "survey", maxItems=1, context=context)
-    kataster_status = getXML(cavecontents, "kataster_status", maxItems=1, context=context)
-    underground_centre_line = getXML(cavecontents, "underground_centre_line", maxItems=1, context=context)
-    notes = getXML(cavecontents, "notes", maxItems=1, context=context)
-    length = getXML(cavecontents, "length", maxItems=1, context=context)
-    depth = getXML(cavecontents, "depth", maxItems=1, context=context)
-    extent = getXML(cavecontents, "extent", maxItems=1, context=context)
-    survex_file = getXML(cavecontents, "survex_file", maxItems=1, context=context)
-    description_file = getXML(cavecontents, "description_file", maxItems=1, context=context)
-    url = getXML(cavecontents, "url", maxItems=1, context=context)
-    entrances = getXML(cavecontents, "entrance", context=context)
-
-    if not (
-        len(non_public) == 1
-        and len(slugs) >= 1 # is this really correct ?
-        and len(official_name) == 1
-        and len(areas) >= 1 # we want to stop using the sub-ares in 2023
-        and len(kataster_code) == 1
-        and len(kataster_number) == 1
-        and len(unofficial_number) == 1
-        and len(explorers) == 1
-        and len(underground_description) == 1
-        and len(equipment) == 1
-        and len(references) == 1
-        and len(survey) == 1
-        and len(kataster_status) == 1
-        and len(underground_centre_line) == 1
-        and len(notes) == 1
-        and len(length) == 1
-        and len(depth) == 1
-        and len(extent) == 1
-        and len(survex_file) == 1
-        and len(description_file) == 1
-        and len(url) == 1
-    ):
-        # more than one item in long list
-        message = f' ! ABORT loading this cave. in "{filename}"'
-        DataIssue.objects.create(parser="caves", message=message, url=f"/{slugs}_cave_edit/")
+    if len(slugs) > 1:
+        message = f" ! - More than one slug for a cave: {cave}, slugs: {slugs}. Ignoring all except first."
+        DataIssue.objects.create(parser="caves", message=message, url=f"{cave.url}_edit/")
         print(message)
-        return
-        
-    if cave:
-        # this a re-load prior to editing and we already know the cave id
-        cave.non_public=boolify(non_public)
-        cave.official_name=official_name[0]
-        cave.kataster_code=kataster_code[0]
-        cave.kataster_number=kataster_number[0]
-        cave.unofficial_number=unofficial_number[0]
-        cave.explorers=explorers[0]
-        cave.underground_description=underground_description[0]
-        cave.equipment=equipment[0]
-        cave.references=references[0]
-        cave.survey=survey[0]
-        cave.kataster_status=kataster_status[0]
-        cave.underground_centre_line=underground_centre_line[0]
-        cave.notes=notes[0]
-        cave.length=length[0]
-        cave.depth=depth[0]
-        cave.extent=extent[0]
-        cave.survex_file=survex_file[0]
-        cave.description_file=description_file[0]
-        cave.url=url[0]
-                
-        if len(slugs) > 1:
-            message = f" ! Cave edit failure due to more than one slug: {slugs}, skipping this field edit. "
-            DataIssue.objects.create(parser="caves", message=message)
-            print(message)
+    slug = slugs[0]
 
-        cave.areas = None
-        cave.save()
-        for area_slug in areas:
-            a = Area.objects.filter(short_name=area_slug)
-            if a:
-                cave.area.add(a[0]) 
-            else:
-                message = f" ! Cave edit failure due to unrecognised Area: {a}, skipping this field edit. "
-                DataIssue.objects.create(parser="caves", message=message)
-                print(message)
-            
-        c = cave
-        do_entrances()
-        print(f"- {entrances_xslug=}")
-        reload_entrances()
-        cave.save()
-    else:
+    non_public = getXMLmax1("non_public")
+    official_name = getXMLmax1("official_name")
+    kataster_code = getXMLmax1("kataster_code")
+    kataster_number = getXMLmax1("kataster_number")
+    unofficial_number = getXMLmax1("unofficial_number")
+    explorers = getXMLmax1("explorers")
+    underground_description = getXMLmax1("underground_description")
+    equipment = getXMLmax1("equipment")
+    references = getXMLmax1("references")
+    survey = getXMLmax1("survey")
+    kataster_status = getXMLmax1("kataster_status")
+    underground_centre_line = getXMLmax1("underground_centre_line")
+    notes = getXMLmax1("notes")
+    length = getXMLmax1("length")
+    depth = getXMLmax1("depth")
+    extent = getXMLmax1("extent")
+    survex_file = getXMLmax1("survex_file")
+    description_file = getXMLmax1("description_file")
+    url = getXMLmax1("url")
+    
+    manual_edit = True
+    if not cave:
+        manual_edit = False
         try:
-            c, state = Cave.objects.update_or_create(
-                non_public=boolify(non_public),
-                official_name=official_name[0],
-                kataster_code=kataster_code[0],
-                kataster_number=kataster_number[0],
-                unofficial_number=unofficial_number[0],
-                explorers=explorers[0],
-                underground_description=underground_description[0],
-                equipment=equipment[0],
-                references=references[0],
-                survey=survey[0],
-                kataster_status=kataster_status[0],
-                underground_centre_line=underground_centre_line[0],
-                notes=notes[0],
-                length=length[0],
-                depth=depth[0],
-                extent=extent[0],
-                survex_file=survex_file[0],
-                description_file=description_file[0],
-                url=url[0],
-                filename=filename,
-            )
+            cave, state = Cave.objects.update_or_create(filename=filename) # replace with slug when CaveSlug tidied up
         except:
-            print(" ! FAILED to get only one CAVE when updating using: " + filename)
-            kaves = Cave.objects.all().filter(kataster_number=kataster_number[0])
+            print(" ! FAILED to get only one CAVE in db when updating using: " + filename)
+            kaves = Cave.objects.all().filter(filename=filename) # replace with slug when CaveSlug tidied up
             for k in kaves:
                 message = " ! - DUPLICATES in db. kataster:" + str(k.kataster_number) + ", slug:" + str(k.slug())
                 DataIssue.objects.create(parser="caves", message=message)
@@ -627,45 +554,63 @@ def readcave(filename, cave=None):
                 if k.slug() is not None:
                     print(" ! - OVERWRITING this one: slug:" + str(k.slug()))
                     k.notes = "DUPLICATE kataster number found on import. Please fix\n" + k.notes
-                    c = k
-                    
-        for area_slug in areas:
-            if area_slug in areas_xslug:
-                newArea = areas_xslug[area_slug]
-            else:
-                area = Area.objects.filter(short_name=area_slug)
-                if area:
-                    newArea = area[0]
-                else:
-                    newArea = Area(short_name=area_slug, super=Area.objects.get(short_name="1623"))
-                    newArea.save()
-                areas_xslug[area_slug] = newArea
-            c.area.add(newArea)
+                    cave = k
+
+    # From here on the code applies to both edited and newly-imported caves (mostly!)
+    do_caveslugstuff() # needs cave!=None
+    
+    cave.non_public=boolify(non_public)
+    cave.official_name=official_name[0]
+    cave.kataster_code=kataster_code[0]
+    cave.kataster_number=kataster_number[0]
+    cave.unofficial_number=unofficial_number[0]
+    cave.explorers=explorers[0]
+    cave.underground_description=underground_description[0]
+    cave.equipment=equipment[0]
+    cave.references=references[0]
+    cave.survey=survey[0]
+    cave.kataster_status=kataster_status[0]
+    cave.underground_centre_line=underground_centre_line[0]
+    cave.notes=notes[0]
+    cave.length=length[0]
+    cave.depth=depth[0]
+    cave.extent=extent[0]
+    cave.survex_file=survex_file[0]
+    cave.description_file=description_file[0]
+    cave.url=url[0]
             
-        primary = True # this sets the first thing we find to be primary=True and all the others =False
-        for slug in slugs:
-            if slug in caves_xslug:
-                cs = caves_xslug[slug]
+    areas = getXML(cavecontents, "area", context=context)
+    # cave.area_set.clear() # Need to find correct syntax. Does not delete previously loaded areas.. WARNING
+    for area_slug in areas:
+        if area_slug in areas_xslug:
+            newArea = areas_xslug[area_slug]
+        else:
+            areas_new = Area.objects.filter(short_name=area_slug)
+            if areas_new:
+                newArea = areas_new[0] # just the first one we find, but we are going to clean up Areas anyway
             else:
-               try:  # we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
-                    cs = CaveSlug.objects.update_or_create(cave=c, slug=slug, primary=primary)
-                    caves_xslug[slug] = cs
-               except Exception as ex:
-                    #raise
-                    # This fails to do an update! It just crashes.. to be fixed
-                    message = f" ! Cave update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}"
+                # Area not seen before. Should not happen with manual edit
+                if manual_edit:
+                    message = f" ! Cave edit failure due to unrecognised Area: {area_slug[0]}, skipping this field edit. "
                     DataIssue.objects.create(parser="caves", message=message)
-                    print(message)
-
-            primary = False
+                    print(message)                    
+                # super value is highly dodgy
+                newArea = Area(short_name=area_slug, super=Area.objects.get(short_name="1623"))
+                newArea.save()
+            areas_xslug[area_slug] = newArea
+        cave.area.add(newArea)
 
-        if not entrances or len(entrances) < 1:
-            # missing entrance link in cave_data/1623-* .html file
-            set_dummy_entrance(slug[5:], slug, c, msg="DUMMY: no entrances")
-        else:
-            do_entrances()
-
-    # From here on the code applies to both edited and newly-imported caves
+    entrances = getXML(cavecontents, "entrance", context=context)
+    do_entrances()
+    # print(f"- {entrances_xslug=}")
+    if not entrances or len(entrances) < 1:
+        # missing entrance link in cave_data/1623-* .html file
+        set_dummy_entrance(slug[5:], slug, cave, msg="DUMMY: no entrances")
+    else:
+        do_entrances()
+    if manual_edit:
+        reload_entrances()
+        
     if survex_file[0]:
         if not (Path(SURVEX_DATA) / survex_file[0]).is_file():
             message = f' ! {slug:12} survex filename does not exist :LOSER:"{survex_file[0]}" in "{filename}"'
@@ -681,8 +626,9 @@ def readcave(filename, cave=None):
             message = f' ! {slug:12} description filename  "{EXPOWEB}/{description_file[0]}" does not refer to a real file'
             DataIssue.objects.create(parser="caves", message=message, url=f"/{slug}_cave_edit/")
             print(message)
-            # c.description_file="" # done only once, to clear out cruft.
-    c.save()
+
+    cave.save()
+    return cave
 
 
 # ChatGPT replacement attempt 2023-04-21. Obviously very incomplete, but some useful ideas
@@ -695,7 +641,7 @@ def readcave(filename, cave=None):
 # class FailedCaveUpdateException(Exception):
     # pass
 
-# def readcave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug):
+# def read_cave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug):
     # """Reads an entrance description from the .html file and updates the corresponding Cave object"""
     # tree = ET.parse(os.path.join(CAVEDESCRIPTIONS, filename))
     # root = tree.getroot()
@@ -789,17 +735,12 @@ def readcaves():
         print(" - settings.CAVEDESCRIPTIONS: ", CAVEDESCRIPTIONS)
         print(" - Reading Entrances from entrance descriptions xml files")
         for filename in next(os.walk(ENTRANCEDESCRIPTIONS))[2]:  # Should be a better way of getting a list of files
-            # if filename.endswith('.html'):
-            # if Path(filename).stem[5:] in pending:
-            # print(f'Skipping pending entrance dummy file <{filename}>')
-            # else:
-            # readentrance(filename)
-            readentrance(filename)
+            read_entrance(filename)
 
         print(" - Reading Caves from cave descriptions xml files")
         for filename in next(os.walk(CAVEDESCRIPTIONS))[2]:  # Should be a better way of getting a list of files
             if filename.endswith(".html"):
-                readcave(filename)
+                read_cave(filename)
 
     print(" - Setting up all the variously useful alias names")
     GetCaveLookup()