summaryrefslogtreecommitdiffstats
path: root/parsers/caves.py
diff options
context:
space:
mode:
Diffstat (limited to 'parsers/caves.py')
-rw-r--r--parsers/caves.py425
1 files changed, 183 insertions, 242 deletions
diff --git a/parsers/caves.py b/parsers/caves.py
index 7eba28a..5389a8e 100644
--- a/parsers/caves.py
+++ b/parsers/caves.py
@@ -27,6 +27,8 @@ todo = """
- Cannot use Edit This Page for pendingcaves.txt_edit as Edit This Page is expecting an html file.
So we will need a separate file-editing capability just for this configuration file ?!
+- we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
+
- Semi-automagically import all the 1627- pending caves and create HTML files for them to be
edited individually. (These are caves we only know about because we have German survex files.)
@@ -281,6 +283,9 @@ def getXML(text, itemname, minItems=1, maxItems=None, context=""):
"""Reads a single XML tag
Should throw exception rather than producing error message here,
then handle exception in calling routine where it has the context.
+
+ This always succeeds, but it produces an error message on the terminal and in the
+ DataIssue log.
"""
items = re.findall("<%(itemname)s>(.*?)</%(itemname)s>" % {"itemname": itemname}, text, re.S)
if len(items) < minItems:
@@ -300,7 +305,7 @@ def getXML(text, itemname, minItems=1, maxItems=None, context=""):
+ " in file "
+ context
)
- DataIssue.objects.create(parser="caves", message=message)
+ DataIssue.objects.create(parser="caves", message=message, url="" + context)
print(message)
if minItems == 0:
if not items:
@@ -315,12 +320,19 @@ def boolify(boolstrs):
"true": True,
"false": False}[boolstrs[0]]
-def readentrance(filename, ent=None):
- """Reads an entrance description from the .html file
-
- If not called as part of initial import, then the global lists will not be correct
- but this is OK, a search will find them in the db.
- """
+def read_entrance(filename, ent=None):
+ """Reads an entrance description from the .html file.
+
+ If not called as part of initial import, then the global lists will not be correct
+ but this is OK, a search will find them in the db.
+
+ Args:
+ filename: The name of the .html file.
+ ent: The entrance object, if it already exists.
+
+ Returns:
+ The entrance object, or a new entrance object if `ent` is None.
+ """
def getXMLmax1(field):
return getXML(entrancecontents, field, maxItems=1, context=context)
@@ -333,23 +345,22 @@ def readentrance(filename, ent=None):
contents = f.read()
context = filename
- # print("Reading file ENTRANCE {} / {}".format(ENTRANCEDESCRIPTIONS, filename))
entrancecontentslist = getXML(contents, "entrance", maxItems=1, context=context)
if len(entrancecontentslist) != 1:
- message = f'! BAD ENTRANCE at "{filename}". Loading aborted. '
- DataIssue.objects.create(parser="entrances", message=message)
+ message = f'! BAD ENTRANCE DATA in "{filename}". More than one entrance. Edit file manually, click.'
+ DataIssue.objects.create(parser="entrances", message=message, url=f"/entrance_data/{filename}_edit")
print(message)
- return
+ return None
entrancecontents = entrancecontentslist[0]
slugs = getXML(entrancecontents, "slug", context=context)
+ slug = slugs[0]
if len(slugs) >1:
# Only ever one of these per entrance in the expo dataset
- message = f" ! - More than one slug for an entrance: {entrance}, slugs: {slugs}. Aborting."
+ message = f" ! - More than one slug for an entrance: {entrance}, slugs: {slugs}. Ignoring all except first."
DataIssue.objects.create(parser="entrances", message=message, url=f"/cave/{slug}/edit/")
print(message)
- return
lastvisit = getXML(entrancecontents, "last visit date", maxItems=1, minItems=0, context=context)
@@ -376,64 +387,39 @@ def readentrance(filename, ent=None):
underground_description = getXMLmax1("underground_description")
url = getXMLmax1("url")
- if ent:
- ent.name=name[0]
- ent.non_public=boolify(non_public)
- ent.alt=alt[0]
- ent.approach=approach[0]
- ent.bearings=bearings[0]
- ent.easting=easting[0]
- ent.entrance_description=entrance_description[0]
- ent.exact_station=exact_station[0]
- ent.explorers=explorers[0]
- ent.filename=filename
- ent.findability=findability[0]
- ent.findability_description=findability_description[0]
- ent.lastvisit=lastvisit[0]
- ent.location_description=location_description[0]
- ent.map_description=map_description[0]
- ent.marking=marking[0]
- ent.marking_comment=marking_comment[0]
- ent.northing=northing[0]
- ent.other_description=other_description[0]
- ent.other_station=other_station[0]
- ent.photo=photo[0]
- ent.slug=slugs[0]
- ent.tag_station=tag_station[0]
- ent.underground_description=underground_description[0]
- ent.url=url[0]
- ent.save()
- else:
- e, state = Entrance.objects.update_or_create(
- name=name[0],
- non_public=boolify(non_public),
- alt=alt[0],
- approach=approach[0],
- bearings=bearings[0],
- easting=easting[0],
- entrance_description=entrance_description[0],
- exact_station=exact_station[0],
- explorers=explorers[0],
- filename=filename,
- findability=findability[0],
- findability_description=findability_description[0],
- lastvisit=lastvisit[0],
- location_description=location_description[0],
- map_description=map_description[0],
- marking=marking[0],
- marking_comment=marking_comment[0],
- northing=northing[0],
- other_description=other_description[0],
- other_station=other_station[0],
- photo=photo[0],
- slug=slugs[0],
- tag_station=tag_station[0],
- underground_description=underground_description[0],
- url=url[0],
- )
- e.save()
+ if not ent:
+ ent, state = Entrance.objects.update_or_create(slug=slug)
+
+ ent.name=name[0]
+ ent.non_public=boolify(non_public)
+ ent.alt=alt[0]
+ ent.approach=approach[0]
+ ent.bearings=bearings[0]
+ ent.easting=easting[0]
+ ent.entrance_description=entrance_description[0]
+ ent.exact_station=exact_station[0]
+ ent.explorers=explorers[0]
+ ent.filename=filename
+ ent.findability=findability[0]
+ ent.findability_description=findability_description[0]
+ ent.lastvisit=lastvisit[0]
+ ent.location_description=location_description[0]
+ ent.map_description=map_description[0]
+ ent.marking=marking[0]
+ ent.marking_comment=marking_comment[0]
+ ent.northing=northing[0]
+ ent.other_description=other_description[0]
+ ent.other_station=other_station[0]
+ ent.photo=photo[0]
+ # ent.slug=slugs[0]
+ ent.tag_station=tag_station[0]
+ ent.underground_description=underground_description[0]
+ ent.url=url[0]
+
+ ent.save()
+ return ent
-def readcave(filename, cave=None):
+def read_cave(filename, cave=None):
"""Reads an entrance description from the .html file
Convoluted. Sorry. Needs rewriting
Assumes any area it hasn't seen before is a subarea of 1623
@@ -441,9 +427,13 @@ def readcave(filename, cave=None):
If not called as part of initial import, then the global lists will not be correct
but this is OK, a search will find them in the db.
"""
+ def getXMLmax1(field):
+ return getXML(cavecontents, field, maxItems=1, context=context)
+
def do_entrances():
"""For both bulk import and individual re-reading of cave_data file,
fix the entrances
+ What is Class CaveAndEntrance for?
"""
for e in entrances:
eslug = getXML(e, "entranceslug", maxItems=1, context=context)[0]
@@ -458,11 +448,11 @@ def readcave(filename, cave=None):
entrance = Entrance.objects.get(slug=eslug)
entrances_xslug[eslug] = entrance
CaveAndEntrance.objects.update_or_create(
- cave=c, entrance_letter=letter, entrance=entrance
+ cave=cave, entrance_letter=letter, entrance=entrance
)
except:
- message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{c}" filename:"cave_data/{filename}"'
- DataIssue.objects.create(parser="entrances", message=message, url=f"{c.url}_edit/")
+ message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{cave}" filename:"cave_data/{filename}"'
+ DataIssue.objects.create(parser="entrances", message=message, url=f"{cave.url}_edit/")
print(message)
def reload_entrances():
"""For individual re-reading of a cave_data file when editing,
@@ -470,155 +460,92 @@ def readcave(filename, cave=None):
"""
for eslug in entrances_xslug:
entrance = entrances_xslug[eslug]
- readentrance(entrance.filename, ent=entrance)
+ read_entrance(entrance.filename, ent=entrance)
entrance.save()
-
+
+ def do_caveslugstuff():
+ """This may be a fossil. We only have one slug per cave in troggle.
+ Pending destruction of this whole concept and Class CaveSlug
+ What is Class CaveSlug for?
+ """
+ primary = True # this sets the first thing we find to be primary=True and all the others =False
+ for slug in slugs:
+ if slug in caves_xslug:
+ cs = caves_xslug[slug]
+ else:
+ try:
+ cs = CaveSlug.objects.update_or_create(cave=cave, slug=slug, primary=primary)
+ caves_xslug[slug] = cs
+ except Exception as ex:
+ #raise
+ # This fails to do an update! It just crashes.. to be fixed
+ message = f" ! CaveSlug update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}"
+ DataIssue.objects.create(parser="caves", message=message, url=f"{cave.url}_edit/")
+ print(message)
+ primary = False
+
global entrances_xslug
global caves_xslug
global areas_xslug
+ # Note: we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
+
# Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
fn = settings.CAVEDESCRIPTIONS / filename
# print(f" - Reading Cave from cave descriptions file {fn}")
if not fn.exists():
message = f" ! Cave_data file reading problem filename:'cave_data/{filename}'"
- DataIssue.objects.create(parser="caves", message=message, url=None)
+ DataIssue.objects.create(parser="caves", message=message, url=f"/cave_data/{filename}_edit")
print(message)
- return
+ return None
with open(fn) as f:
contents = f.read()
context = filename
cavecontentslist = getXML(contents, "cave", maxItems=1, context=context)
+
if len(cavecontentslist) != 1:
- message = f'! BAD CAVE at "{filename}"'
- DataIssue.objects.create(parser="caves", message=message)
+ message = f'! BAD CAVE DATA in "{filename}". More than one cave. Edit file manually, click.'
+ DataIssue.objects.create(parser="caves", message=message, url=f"/cave_data/{filename}_edit")
print(message)
- return
+ return None
cavecontents = cavecontentslist[0]
- non_public = getXML(cavecontents, "non_public", maxItems=1, context=context)
slugs = getXML(cavecontents, "caveslug", maxItems=1, context=context)
- official_name = getXML(cavecontents, "official_name", maxItems=1, context=context)
- areas = getXML(cavecontents, "area", context=context)
- kataster_code = getXML(cavecontents, "kataster_code", maxItems=1, context=context)
- kataster_number = getXML(cavecontents, "kataster_number", maxItems=1, context=context)
- unofficial_number = getXML(cavecontents, "unofficial_number", maxItems=1, context=context)
- explorers = getXML(cavecontents, "explorers", maxItems=1, context=context)
- underground_description = getXML(cavecontents, "underground_description", maxItems=1, context=context)
- equipment = getXML(cavecontents, "equipment", maxItems=1, context=context)
- references = getXML(cavecontents, "references", maxItems=1, context=context)
- survey = getXML(cavecontents, "survey", maxItems=1, context=context)
- kataster_status = getXML(cavecontents, "kataster_status", maxItems=1, context=context)
- underground_centre_line = getXML(cavecontents, "underground_centre_line", maxItems=1, context=context)
- notes = getXML(cavecontents, "notes", maxItems=1, context=context)
- length = getXML(cavecontents, "length", maxItems=1, context=context)
- depth = getXML(cavecontents, "depth", maxItems=1, context=context)
- extent = getXML(cavecontents, "extent", maxItems=1, context=context)
- survex_file = getXML(cavecontents, "survex_file", maxItems=1, context=context)
- description_file = getXML(cavecontents, "description_file", maxItems=1, context=context)
- url = getXML(cavecontents, "url", maxItems=1, context=context)
- entrances = getXML(cavecontents, "entrance", context=context)
-
- if not (
- len(non_public) == 1
- and len(slugs) >= 1 # is this really correct ?
- and len(official_name) == 1
- and len(areas) >= 1 # we want to stop using the sub-ares in 2023
- and len(kataster_code) == 1
- and len(kataster_number) == 1
- and len(unofficial_number) == 1
- and len(explorers) == 1
- and len(underground_description) == 1
- and len(equipment) == 1
- and len(references) == 1
- and len(survey) == 1
- and len(kataster_status) == 1
- and len(underground_centre_line) == 1
- and len(notes) == 1
- and len(length) == 1
- and len(depth) == 1
- and len(extent) == 1
- and len(survex_file) == 1
- and len(description_file) == 1
- and len(url) == 1
- ):
- # more than one item in long list
- message = f' ! ABORT loading this cave. in "{filename}"'
- DataIssue.objects.create(parser="caves", message=message, url=f"/{slugs}_cave_edit/")
+ if len(slugs) > 1:
+ message = f" ! - More than one slug for a cave: {cave}, slugs: {slugs}. Ignoring all except first."
+ DataIssue.objects.create(parser="caves", message=message, url=f"{cave.url}_edit/")
print(message)
- return
-
- if cave:
- # this a re-load prior to editing and we already know the cave id
- cave.non_public=boolify(non_public)
- cave.official_name=official_name[0]
- cave.kataster_code=kataster_code[0]
- cave.kataster_number=kataster_number[0]
- cave.unofficial_number=unofficial_number[0]
- cave.explorers=explorers[0]
- cave.underground_description=underground_description[0]
- cave.equipment=equipment[0]
- cave.references=references[0]
- cave.survey=survey[0]
- cave.kataster_status=kataster_status[0]
- cave.underground_centre_line=underground_centre_line[0]
- cave.notes=notes[0]
- cave.length=length[0]
- cave.depth=depth[0]
- cave.extent=extent[0]
- cave.survex_file=survex_file[0]
- cave.description_file=description_file[0]
- cave.url=url[0]
-
- if len(slugs) > 1:
- message = f" ! Cave edit failure due to more than one slug: {slugs}, skipping this field edit. "
- DataIssue.objects.create(parser="caves", message=message)
- print(message)
+ slug = slugs[0]
- cave.areas = None
- cave.save()
- for area_slug in areas:
- a = Area.objects.filter(short_name=area_slug)
- if a:
- cave.area.add(a[0])
- else:
- message = f" ! Cave edit failure due to unrecognised Area: {a}, skipping this field edit. "
- DataIssue.objects.create(parser="caves", message=message)
- print(message)
-
- c = cave
- do_entrances()
- print(f"- {entrances_xslug=}")
- reload_entrances()
- cave.save()
- else:
+ non_public = getXMLmax1("non_public")
+ official_name = getXMLmax1("official_name")
+ kataster_code = getXMLmax1("kataster_code")
+ kataster_number = getXMLmax1("kataster_number")
+ unofficial_number = getXMLmax1("unofficial_number")
+ explorers = getXMLmax1("explorers")
+ underground_description = getXMLmax1("underground_description")
+ equipment = getXMLmax1("equipment")
+ references = getXMLmax1("references")
+ survey = getXMLmax1("survey")
+ kataster_status = getXMLmax1("kataster_status")
+ underground_centre_line = getXMLmax1("underground_centre_line")
+ notes = getXMLmax1("notes")
+ length = getXMLmax1("length")
+ depth = getXMLmax1("depth")
+ extent = getXMLmax1("extent")
+ survex_file = getXMLmax1("survex_file")
+ description_file = getXMLmax1("description_file")
+ url = getXMLmax1("url")
+
+ manual_edit = True
+ if not cave:
+ manual_edit = False
try:
- c, state = Cave.objects.update_or_create(
- non_public=boolify(non_public),
- official_name=official_name[0],
- kataster_code=kataster_code[0],
- kataster_number=kataster_number[0],
- unofficial_number=unofficial_number[0],
- explorers=explorers[0],
- underground_description=underground_description[0],
- equipment=equipment[0],
- references=references[0],
- survey=survey[0],
- kataster_status=kataster_status[0],
- underground_centre_line=underground_centre_line[0],
- notes=notes[0],
- length=length[0],
- depth=depth[0],
- extent=extent[0],
- survex_file=survex_file[0],
- description_file=description_file[0],
- url=url[0],
- filename=filename,
- )
+ cave, state = Cave.objects.update_or_create(filename=filename) # replace with slug when CaveSlug tidied up
except:
- print(" ! FAILED to get only one CAVE when updating using: " + filename)
- kaves = Cave.objects.all().filter(kataster_number=kataster_number[0])
+ print(" ! FAILED to get only one CAVE in db when updating using: " + filename)
+ kaves = Cave.objects.all().filter(filename=filename) # replace with slug when CaveSlug tidied up
for k in kaves:
message = " ! - DUPLICATES in db. kataster:" + str(k.kataster_number) + ", slug:" + str(k.slug())
DataIssue.objects.create(parser="caves", message=message)
@@ -627,45 +554,63 @@ def readcave(filename, cave=None):
if k.slug() is not None:
print(" ! - OVERWRITING this one: slug:" + str(k.slug()))
k.notes = "DUPLICATE kataster number found on import. Please fix\n" + k.notes
- c = k
-
- for area_slug in areas:
- if area_slug in areas_xslug:
- newArea = areas_xslug[area_slug]
- else:
- area = Area.objects.filter(short_name=area_slug)
- if area:
- newArea = area[0]
- else:
- newArea = Area(short_name=area_slug, super=Area.objects.get(short_name="1623"))
- newArea.save()
- areas_xslug[area_slug] = newArea
- c.area.add(newArea)
+ cave = k
+
+ # From here on the code applies to both edited and newly-imported caves (mostly!)
+ do_caveslugstuff() # needs cave!=None
+
+ cave.non_public=boolify(non_public)
+ cave.official_name=official_name[0]
+ cave.kataster_code=kataster_code[0]
+ cave.kataster_number=kataster_number[0]
+ cave.unofficial_number=unofficial_number[0]
+ cave.explorers=explorers[0]
+ cave.underground_description=underground_description[0]
+ cave.equipment=equipment[0]
+ cave.references=references[0]
+ cave.survey=survey[0]
+ cave.kataster_status=kataster_status[0]
+ cave.underground_centre_line=underground_centre_line[0]
+ cave.notes=notes[0]
+ cave.length=length[0]
+ cave.depth=depth[0]
+ cave.extent=extent[0]
+ cave.survex_file=survex_file[0]
+ cave.description_file=description_file[0]
+ cave.url=url[0]
- primary = True # this sets the first thing we find to be primary=True and all the others =False
- for slug in slugs:
- if slug in caves_xslug:
- cs = caves_xslug[slug]
+ areas = getXML(cavecontents, "area", context=context)
+ # cave.area_set.clear() # Need to find correct syntax. Does not delete previously loaded areas.. WARNING
+ for area_slug in areas:
+ if area_slug in areas_xslug:
+ newArea = areas_xslug[area_slug]
+ else:
+ areas_new = Area.objects.filter(short_name=area_slug)
+ if areas_new:
+ newArea = areas_new[0] # just the first one we find, but we are going to clean up Areas anyway
else:
- try: # we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
- cs = CaveSlug.objects.update_or_create(cave=c, slug=slug, primary=primary)
- caves_xslug[slug] = cs
- except Exception as ex:
- #raise
- # This fails to do an update! It just crashes.. to be fixed
- message = f" ! Cave update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}"
+ # Area not seen before. Should not happen with manual edit
+ if manual_edit:
+ message = f" ! Cave edit failure due to unrecognised Area: {area_slug[0]}, skipping this field edit. "
DataIssue.objects.create(parser="caves", message=message)
- print(message)
-
- primary = False
+ print(message)
+ # super value is highly dodgy
+ newArea = Area(short_name=area_slug, super=Area.objects.get(short_name="1623"))
+ newArea.save()
+ areas_xslug[area_slug] = newArea
+ cave.area.add(newArea)
- if not entrances or len(entrances) < 1:
- # missing entrance link in cave_data/1623-* .html file
- set_dummy_entrance(slug[5:], slug, c, msg="DUMMY: no entrances")
- else:
- do_entrances()
-
- # From here on the code applies to both edited and newly-imported caves
+ entrances = getXML(cavecontents, "entrance", context=context)
+ do_entrances()
+ # print(f"- {entrances_xslug=}")
+ if not entrances or len(entrances) < 1:
+ # missing entrance link in cave_data/1623-* .html file
+ set_dummy_entrance(slug[5:], slug, cave, msg="DUMMY: no entrances")
+ else:
+ do_entrances()
+ if manual_edit:
+ reload_entrances()
+
if survex_file[0]:
if not (Path(SURVEX_DATA) / survex_file[0]).is_file():
message = f' ! {slug:12} survex filename does not exist :LOSER:"{survex_file[0]}" in "{filename}"'
@@ -681,8 +626,9 @@ def readcave(filename, cave=None):
message = f' ! {slug:12} description filename "{EXPOWEB}/{description_file[0]}" does not refer to a real file'
DataIssue.objects.create(parser="caves", message=message, url=f"/{slug}_cave_edit/")
print(message)
- # c.description_file="" # done only once, to clear out cruft.
- c.save()
+
+ cave.save()
+ return cave
# ChatGPT replacement attempt 2023-04-21. Obviously very incomplete, but some useful ideas
@@ -695,7 +641,7 @@ def readcave(filename, cave=None):
# class FailedCaveUpdateException(Exception):
# pass
-# def readcave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug):
+# def read_cave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug):
# """Reads an entrance description from the .html file and updates the corresponding Cave object"""
# tree = ET.parse(os.path.join(CAVEDESCRIPTIONS, filename))
# root = tree.getroot()
@@ -789,17 +735,12 @@ def readcaves():
print(" - settings.CAVEDESCRIPTIONS: ", CAVEDESCRIPTIONS)
print(" - Reading Entrances from entrance descriptions xml files")
for filename in next(os.walk(ENTRANCEDESCRIPTIONS))[2]: # Should be a better way of getting a list of files
- # if filename.endswith('.html'):
- # if Path(filename).stem[5:] in pending:
- # print(f'Skipping pending entrance dummy file <{filename}>')
- # else:
- # readentrance(filename)
- readentrance(filename)
+ read_entrance(filename)
print(" - Reading Caves from cave descriptions xml files")
for filename in next(os.walk(CAVEDESCRIPTIONS))[2]: # Should be a better way of getting a list of files
if filename.endswith(".html"):
- readcave(filename)
+ read_cave(filename)
print(" - Setting up all the variously useful alias names")
GetCaveLookup()