path: root/parsers/caves.py
author     Philip Sargent <philip.sargent@gmail.com>  2023-04-22 23:15:50 +0100
committer  Philip Sargent <philip.sargent@gmail.com>  2023-04-22 23:15:50 +0100
commit     c5a9bdc7248368d6ec8baa0f2061a3512ae7aaba (patch)
tree       c82108b2731bb37bddac5fcdc8861664efddc5bf /parsers/caves.py
parent     30ef427b904f2f1fcf3635d5944b14615b48dcee (diff)
xml parser attempt retract
Diffstat (limited to 'parsers/caves.py')
-rw-r--r--  parsers/caves.py  80
1 file changed, 13 insertions, 67 deletions
diff --git a/parsers/caves.py b/parsers/caves.py
index 5389a8e..e1ec8dc 100644
--- a/parsers/caves.py
+++ b/parsers/caves.py
@@ -1,5 +1,7 @@
import os
import re
+
+
from pathlib import Path
from django.conf import settings
@@ -426,6 +428,14 @@ def read_cave(filename, cave=None):
If not called as part of initial import, then the global lists will not be correct
but this is OK, a search will find them in the db.
+
+ Attempted to use standard python3.11 xml library but fails on HTML entities (2023-04-23)
+ import xml.etree.ElementTree as ET
+ tree = ET.parse(fn)
+ xml_root = tree.getroot()
+ for t in ["html", "head", "body", "cave","non_public", "caveslug", "official_name","entrance"]:
+ elements = xml_root.findall(t)
+
"""
def getXMLmax1(field):
return getXML(cavecontents, field, maxItems=1, context=context)
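
The new docstring lines above record that Python's standard xml.etree.ElementTree parser rejects the cave_data files because they contain HTML entities. A minimal sketch of that failure mode, plus one possible workaround (decoding the entities with html.unescape first, which is an assumption about a fix, not something this commit does):

import html
import xml.etree.ElementTree as ET

# Illustrative cave_data-style snippet; the element names come from the
# docstring above, the &ouml; entity is just an example.
snippet = "<cave><official_name>Eish&ouml;hle</official_name></cave>"

try:
    ET.fromstring(snippet)  # expat raises ParseError: undefined entity &ouml;
except ET.ParseError as e:
    print(f"ElementTree rejects HTML entities: {e}")

# Possible workaround (an assumption, not what troggle does): decode the HTML
# entities to plain Unicode before parsing. Note that html.unescape() also
# decodes &amp; and &lt;, so a real fix would need to be more careful than this.
root = ET.fromstring(html.unescape(snippet))
print(root.findtext("official_name"))  # Eishöhle
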
@@ -492,6 +502,8 @@ def read_cave(filename, cave=None):
# Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
fn = settings.CAVEDESCRIPTIONS / filename
+ context = filename
+
# print(f" - Reading Cave from cave descriptions file {fn}")
if not fn.exists():
message = f" ! Cave_data file reading problem filename:'cave_data/{filename}'"
@@ -501,7 +513,6 @@ def read_cave(filename, cave=None):
with open(fn) as f:
contents = f.read()
- context = filename
cavecontentslist = getXML(contents, "cave", maxItems=1, context=context)
if len(cavecontentslist) != 1:
@@ -580,7 +591,7 @@ def read_cave(filename, cave=None):
cave.url=url[0]
areas = getXML(cavecontents, "area", context=context)
- # cave.area_set.clear() # Need to find correct syntax. Does not delete previously loaded areas.. WARNING
+ cave.area.clear() # Deletes all links to areas in db
for area_slug in areas:
if area_slug in areas_xslug:
newArea = areas_xslug[area_slug]
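
The changed line above relies on Django's related manager for a ManyToManyField: cave.area.clear() deletes only the join-table rows linking the cave to its areas, leaving the Area records themselves intact, and add() recreates the links. A hedged sketch of that pattern, assuming Cave.area is a ManyToManyField and guessing the import path (not taken from this commit):

from troggle.core.models.caves import Area, Cave  # assumed module path

def relink_areas(cave: Cave, parsed_areas: list[Area]) -> None:
    """Replace the cave's area links with the areas parsed from its cave_data file."""
    cave.area.clear()        # drops existing Cave<->Area links (join rows only)
    for area in parsed_areas:
        cave.area.add(area)  # re-attach each parsed Area; no Cave.save() needed for M2M links
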
@@ -630,70 +641,6 @@ def read_cave(filename, cave=None):
cave.save()
return cave
-
-# ChatGPT replacement attempt 2023-04-21. Obviously very incomplete, but some useful ideas
-# import os
-# import xml.etree.ElementTree as ET
-
-# class BadCaveException(Exception):
- # pass
-
-# class FailedCaveUpdateException(Exception):
- # pass
-
-# def read_cave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug):
- # """Reads an entrance description from the .html file and updates the corresponding Cave object"""
- # tree = ET.parse(os.path.join(CAVEDESCRIPTIONS, filename))
- # root = tree.getroot()
-
- # cavecontents = root.find("cave")
- # if cavecontents is None:
- # raise BadCaveException(f'! BAD CAVE at "{filename}"')
-
- # non_public = cavecontents.findtext("non_public")
- # slugs = cavecontents.findtext("caveslug")
- # official_name = cavecontents.findtext("official_name")
- # kataster_code = cavecontents.findtext("kataster_code")
- # kataster_number = cavecontents.findtext("kataster_number")
- # unofficial_number = cavecontents.findtext("unofficial_number")
- # explorers = cavecontents.findtext("explorers")
- # underground_description = cavecontents.findtext("underground_description")
- # equipment = cavecontents.findtext("equipment")
- # references = cavecontents.findtext("references")
- # survey = cavecontents.findtext("survey")
- # kataster_status = cavecontents.findtext("kataster_status")
- # underground_centre_line = cavecontents.findtext("underground_centre_line")
- # notes = cavecontents.findtext("notes")
- # length = cavecontents.findtext("length")
- # depth = cavecontents.findtext("depth")
- # extent = cavecontents.findtext("extent")
- # survex_file = cavecontents.findtext("survex_file")
- # description_file = cavecontents.findtext("description_file")
- # url = cavecontents.findtext("url")
-
- # areas = cavecontents.findall("area")
- # entrances = cavecontents.findall("entrance")
-
- # if (
- # non_public is not None
- # # etc.
- # # wrong, some of these should be ==1 and some >=1
- # ):
- # try:
- # cave = caves_xslug.get(kataster_number)
- # if cave is None:
- # cave = Cave.objects.create(
- # non_public={
- # "True": True,
- # "False": False,
- # "true": True,
- # "false": False,
- # }[non_public],
- # official_name=official_name,
- # # kataster [truncated]
-
-
-
def readcaves():
"""Called from databaseReset mass importer.
Reads the xml-format HTML 'cave' files in the EXPOWEB repo, the survex files from the loser repo.
@@ -732,7 +679,6 @@ def readcaves():
with transaction.atomic():
area = get_area("1623")
- print(" - settings.CAVEDESCRIPTIONS: ", CAVEDESCRIPTIONS)
print(" - Reading Entrances from entrance descriptions xml files")
for filename in next(os.walk(ENTRANCEDESCRIPTIONS))[2]: # Should be a better way of getting a list of files
read_entrance(filename)
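
The loop above keeps the in-code comment asking for a better way to list the entrance description files than next(os.walk(ENTRANCEDESCRIPTIONS))[2]. One pathlib-based alternative, sketched as a suggestion rather than as part of this commit:

from pathlib import Path

def list_data_files(directory) -> list[str]:
    """Names of the regular files directly inside directory,
    equivalent to next(os.walk(directory))[2] but sorted for a stable order."""
    return sorted(p.name for p in Path(directory).iterdir() if p.is_file())

# for filename in list_data_files(ENTRANCEDESCRIPTIONS):
#     read_entrance(filename)
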