diff options
author | Philip Sargent <philip.sargent@klebos.com> | 2021-04-15 17:51:01 +0100 |
---|---|---|
committer | Philip Sargent <philip.sargent@klebos.com> | 2021-04-15 17:51:01 +0100 |
commit | 27491c933a3b676960179448c7c28ba1b788e3e7 (patch) | |
tree | 61f90f2b20482f17ddaded9b6a7e335be1f0b7be /parsers/people.py | |
parent | 7124d978d31b839efc57112a9aa2c6d82c2d60d7 (diff) | |
download | troggle-27491c933a3b676960179448c7c28ba1b788e3e7.tar.gz troggle-27491c933a3b676960179448c7c28ba1b788e3e7.tar.bz2 troggle-27491c933a3b676960179448c7c28ba1b788e3e7.zip |
enabled mugshots & blurb in people pages
Diffstat (limited to 'parsers/people.py')
-rw-r--r-- | parsers/people.py | 75 |
1 files changed, 52 insertions, 23 deletions
diff --git a/parsers/people.py b/parsers/people.py index 345210c..3f7c02a 100644 --- a/parsers/people.py +++ b/parsers/people.py @@ -1,11 +1,13 @@ import csv, re, datetime, os, shutil from html.parser import HTMLParser from unidecode import unidecode +from pathlib import Path from django.conf import settings from troggle.core.models.troggle import Expedition, Person, PersonExpedition -from troggle.core.utils import save_carefully +from troggle.core.models.troggle import DataIssue +from troggle.core.utils import save_carefully, TROG '''These functions do not match how the stand-alone folk script works. So the script produces an HTML file which has href links to pages in troggle which troggle does not think are right. @@ -13,32 +15,59 @@ The standalone script needs to be renedred defucnt, and all the parsing needs to or they should use the same code by importing a module. ''' -def parseMugShotAndBlurb(personline, header, person): +def parse_blurb(personline, header, person): """create mugshot Photo instance""" - mugShotFilename=personline[header["Mugshot"]] - mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename) - if mugShotPath[-3:]=='jpg': #if person just has an image, add it - #saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person) + ms_filename = personline[header["Mugshot"]] + ms_path = Path(settings.EXPOWEB, "folk", ms_filename) + + if ms_filename: + if not ms_path.is_file(): + message = f"! INVALID mug_shot field '{ms_filename}' for {person.fullname}" + print(message) + DataIssue.objects.create(parser='people', message=message, url=f"/person/{person.fullname}") + return + + if ms_filename.startswith('i/'): + #if person just has an image, add it. It has format 'i/adama2018.jpg' + person.mug_shot = str(Path("/folk", ms_filename)) + person.blurb = None + + elif ms_filename.startswith('l/'): + # it has the format 'l/ollybetts.htm' the file may contain <img src="../i/mymug.jpg"> images + with open(ms_path,'r') as blurbfile: + blrb = blurbfile.read() + pblurb=re.search(r'<body>.*<hr',blrb,re.DOTALL) + if pblurb: + person.mug_shot = None + fragment= re.search('<body>(.*)<hr',blrb,re.DOTALL).group(1) + fragment = fragment.replace('src="../i/', 'src="/folk/i/') + fragment = fragment.replace("src='../i/", "src='/folk/i/") + fragment = re.sub(r'<h.*>[^<]*</h.>', '', fragment) + # replace src="../i/ with src="/folk/i + person.blurb = fragment + else: + message = f"! Blurb parse error in {ms_filename}" + print(message) + DataIssue.objects.create(parser='people', message=message, url="/folk/") + + elif ms_filename == '': pass - elif mugShotPath[-3:]=='htm': #if person has an html page, find the image(s) and add it. Also, add the text from the html page to the "blurb" field in his model instance. - personPageOld=open(mugShotPath,'r').read() - if not person.blurb: - pblurb=re.search('<body>.*<hr',personPageOld,re.DOTALL) - if pblurb: - #this needs to be refined, take care of the HTML and make sure it doesn't match beyond the blurb. - #Only finds the first image, not all of them - person.blurb=re.search('<body>.*<hr',personPageOld,re.DOTALL).group() - else: - print("ERROR: --------------- Broken link or Blurb parse error in ", mugShotFilename) - #for mugShotFilename in re.findall('i/.*?jpg',personPageOld,re.DOTALL): - # mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename) - # saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person) + else: + message = f"! Unrecognised type of file at mug_shot field '{ms_filename}' for {person.fullname}" + print(message) + DataIssue.objects.create(parser='people', message=message, url="/folk/") + person.save() -def LoadPersonsExpos(): +def load_people_expos(): + '''This is where the folk.csv file is parsed to read people's names. + Which it gets wrong for people like Lydia-Clare Leather and various 'von' and 'de' middle 'names' + and McLean and Mclean and McAdam - interaction with the url parser in urls.py too + ''' + DataIssue.objects.filter(parser='people').delete() - persontab = open(os.path.join(settings.EXPOWEB, "folk", "folk.csv")) - personreader = csv.reader(persontab) + persontab = open(os.path.join(settings.EXPOWEB, "folk", "folk.csv")) # should really be EXPOFOLK I guess + personreader = csv.reader(persontab) # this is an iterator headers = next(personreader) header = dict(list(zip(headers, list(range(len(headers)))))) @@ -86,7 +115,7 @@ def LoadPersonsExpos(): nonLookupAttribs={'is_vfho':vfho, 'fullname':fullname} person, created = save_carefully(Person, lookupAttribs, nonLookupAttribs) - parseMugShotAndBlurb(personline=personline, header=header, person=person) + parse_blurb(personline=personline, header=header, person=person) # make person expedition from table for year, attended in list(zip(headers, personline))[5:]: |