summary | refs | log | tree | commit | diff | stats
path: root/parsers
diff options
context:
space:
mode:
Diffstat (limited to 'parsers')
-rw-r--r-- parsers/imports.py 2
-rw-r--r-- parsers/people.py 75
-rw-r--r-- parsers/survex.py 7
3 files changed, 60 insertions, 24 deletions
diff --git a/parsers/imports.py b/parsers/imports.py
index bfafbfe..01c1d7b 100644
--- a/parsers/imports.py
+++ b/parsers/imports.py
@@ -27,7 +27,7 @@ def import_people():
print("-- Importing People (folk.csv) to ",end="")
print(django.db.connections.databases['default']['NAME'])
with transaction.atomic():
- troggle.parsers.people.LoadPersonsExpos()
+ troggle.parsers.people.load_people_expos()
def import_surveyscans():
print("-- Importing Survey Scans")
diff --git a/parsers/people.py b/parsers/people.py
index 345210c..3f7c02a 100644
--- a/parsers/people.py
+++ b/parsers/people.py
@@ -1,11 +1,13 @@
import csv, re, datetime, os, shutil
from html.parser import HTMLParser
from unidecode import unidecode
+from pathlib import Path
from django.conf import settings
from troggle.core.models.troggle import Expedition, Person, PersonExpedition
-from troggle.core.utils import save_carefully
+from troggle.core.models.troggle import DataIssue
+from troggle.core.utils import save_carefully, TROG
'''These functions do not match how the stand-alone folk script works. So the script produces an HTML file which has
href links to pages in troggle which troggle does not think are right.
@@ -13,32 +15,59 @@ The standalone script needs to be rendered defunct, and all the parsing needs to
or they should use the same code by importing a module.
'''
-def parseMugShotAndBlurb(personline, header, person):
+def parse_blurb(personline, header, person):
"""create mugshot Photo instance"""
- mugShotFilename=personline[header["Mugshot"]]
- mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)
- if mugShotPath[-3:]=='jpg': #if person just has an image, add it
- #saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person)
+ ms_filename = personline[header["Mugshot"]]
+ ms_path = Path(settings.EXPOWEB, "folk", ms_filename)
+
+ if ms_filename:
+ if not ms_path.is_file():
+ message = f"! INVALID mug_shot field '{ms_filename}' for {person.fullname}"
+ print(message)
+ DataIssue.objects.create(parser='people', message=message, url=f"/person/{person.fullname}")
+ return
+
+ if ms_filename.startswith('i/'):
+ #if person just has an image, add it. It has format 'i/adama2018.jpg'
+ person.mug_shot = str(Path("/folk", ms_filename))
+ person.blurb = None
+
+ elif ms_filename.startswith('l/'):
+ # it has the format 'l/ollybetts.htm' the file may contain <img src="../i/mymug.jpg"> images
+ with open(ms_path,'r') as blurbfile:
+ blrb = blurbfile.read()
+ pblurb=re.search(r'<body>.*<hr',blrb,re.DOTALL)
+ if pblurb:
+ person.mug_shot = None
+ fragment= re.search('<body>(.*)<hr',blrb,re.DOTALL).group(1)
+ fragment = fragment.replace('src="../i/', 'src="/folk/i/')
+ fragment = fragment.replace("src='../i/", "src='/folk/i/")
+ fragment = re.sub(r'<h.*>[^<]*</h.>', '', fragment)
+ # replace src="../i/ with src="/folk/i
+ person.blurb = fragment
+ else:
+ message = f"! Blurb parse error in {ms_filename}"
+ print(message)
+ DataIssue.objects.create(parser='people', message=message, url="/folk/")
+
+ elif ms_filename == '':
pass
- elif mugShotPath[-3:]=='htm': #if person has an html page, find the image(s) and add it. Also, add the text from the html page to the "blurb" field in his model instance.
- personPageOld=open(mugShotPath,'r').read()
- if not person.blurb:
- pblurb=re.search('<body>.*<hr',personPageOld,re.DOTALL)
- if pblurb:
- #this needs to be refined, take care of the HTML and make sure it doesn't match beyond the blurb.
- #Only finds the first image, not all of them
- person.blurb=re.search('<body>.*<hr',personPageOld,re.DOTALL).group()
- else:
- print("ERROR: --------------- Broken link or Blurb parse error in ", mugShotFilename)
- #for mugShotFilename in re.findall('i/.*?jpg',personPageOld,re.DOTALL):
- # mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)
- # saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person)
+ else:
+ message = f"! Unrecognised type of file at mug_shot field '{ms_filename}' for {person.fullname}"
+ print(message)
+ DataIssue.objects.create(parser='people', message=message, url="/folk/")
+
person.save()
-def LoadPersonsExpos():
+def load_people_expos():
+ '''This is where the folk.csv file is parsed to read people's names.
+ Which it gets wrong for people like Lydia-Clare Leather and various 'von' and 'de' middle 'names'
+ and McLean and Mclean and McAdam - interaction with the url parser in urls.py too
+ '''
+ DataIssue.objects.filter(parser='people').delete()
- persontab = open(os.path.join(settings.EXPOWEB, "folk", "folk.csv"))
- personreader = csv.reader(persontab)
+ persontab = open(os.path.join(settings.EXPOWEB, "folk", "folk.csv")) # should really be EXPOFOLK I guess
+ personreader = csv.reader(persontab) # this is an iterator
headers = next(personreader)
header = dict(list(zip(headers, list(range(len(headers))))))
@@ -86,7 +115,7 @@ def LoadPersonsExpos():
nonLookupAttribs={'is_vfho':vfho, 'fullname':fullname}
person, created = save_carefully(Person, lookupAttribs, nonLookupAttribs)
- parseMugShotAndBlurb(personline=personline, header=header, person=person)
+ parse_blurb(personline=personline, header=header, person=person)
# make person expedition from table
for year, attended in list(zip(headers, personline))[5:]:
diff --git a/parsers/survex.py b/parsers/survex.py
index 23f27aa..5938615 100644
--- a/parsers/survex.py
+++ b/parsers/survex.py
@@ -576,6 +576,7 @@ class LoadingSurvex():
return self.caveslist[cavepath.lower()]
# TO DO - some of this is already done in generating self.caveslist so simplify this
# esp. as it is in a loop.
+ # TO DO recognise cave if different name, e.g. gruenstein == 281
path_match = self.rx_cave.search(cavepath)
if path_match:
sluggy = '{}-{}'.format(path_match.group(1), path_match.group(2))
@@ -608,9 +609,15 @@ class LoadingSurvex():
"""Ignore surface, kataser and gps *include survex files
"""
if headpath in self.ignorenoncave:
+ #message = f" - {headpath} is <ignorenoncave> (while creating '{includelabel}' sfile & sdirectory)"
+ #print("\n"+message)
+ #print("\n"+message,file=sys.stderr)
return
for i in self.ignoreprefix:
if headpath.startswith(i):
+ #message = f" - {headpath} starts with <ignoreprefix> (while creating '{includelabel}' sfile & sdirectory)"
+ #print("\n"+message)
+ #print("\n"+message,file=sys.stderr)
return
message = " ! {} is not a cave. (while creating '{}' sfile & sdirectory)".format(headpath, includelabel)
print("\n"+message)