enabled mugshots & blurb in people pages

author: Philip Sargent <philip.sargent@klebos.com> 2021-04-15 17:51:01 +0100
committer: Philip Sargent <philip.sargent@klebos.com> 2021-04-15 17:51:01 +0100
commit: 27491c933a3b676960179448c7c28ba1b788e3e7 (patch)
tree: 61f90f2b20482f17ddaded9b6a7e335be1f0b7be /parsers/people.py
parent: 7124d978d31b839efc57112a9aa2c6d82c2d60d7 (diff)
download: troggle-27491c933a3b676960179448c7c28ba1b788e3e7.tar.gz
troggle-27491c933a3b676960179448c7c28ba1b788e3e7.tar.bz2
troggle-27491c933a3b676960179448c7c28ba1b788e3e7.zip
1 files changed, 52 insertions, 23 deletions
diff --git a/parsers/people.py b/parsers/people.py
index 345210c..3f7c02a 100644
--- a/parsers/people.py
+++ b/parsers/people.py
@@ -1,11 +1,13 @@
 import csv, re, datetime, os, shutil
 from html.parser import HTMLParser
 from unidecode import unidecode
+from pathlib import Path
 
 from django.conf import settings
 
 from troggle.core.models.troggle import Expedition, Person, PersonExpedition
-from troggle.core.utils import save_carefully
+from troggle.core.models.troggle import DataIssue
+from troggle.core.utils import save_carefully, TROG
 
 '''These functions do not match how the stand-alone folk script works. So the script produces an HTML file which has 
 href links to pages in troggle which troggle does not think are right.
@@ -13,32 +15,59 @@ The standalone script needs to be rendered defunct, and all the parsing needs to
 or they should use the same code by importing a module.
 '''
 
-def parseMugShotAndBlurb(personline, header, person):
+def parse_blurb(personline, header, person):
     """create mugshot Photo instance"""
-    mugShotFilename=personline[header["Mugshot"]]
-    mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)
-    if mugShotPath[-3:]=='jpg': #if person just has an image, add it
-        #saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person)
+    ms_filename = personline[header["Mugshot"]]
+    ms_path = Path(settings.EXPOWEB, "folk", ms_filename)
+    
+    if ms_filename:
+        if not ms_path.is_file():
+            message = f"! INVALID mug_shot field '{ms_filename}' for {person.fullname}"
+            print(message)
+            DataIssue.objects.create(parser='people', message=message, url=f"/person/{person.fullname}")
+            return
+    
+    if ms_filename.startswith('i/'):
+        #if person just has an image, add it. It has format 'i/adama2018.jpg'
+        person.mug_shot = str(Path("/folk", ms_filename))
+        person.blurb = None
+
+    elif ms_filename.startswith('l/'): 
+        # It has the format 'l/ollybetts.htm'; the file may contain <img src="../i/mymug.jpg"> images
+        with open(ms_path,'r') as blurbfile:
+            blrb = blurbfile.read()
+        pblurb=re.search(r'<body>.*<hr',blrb,re.DOTALL)
+        if pblurb:
+            person.mug_shot = None           
+            fragment= re.search('<body>(.*)<hr',blrb,re.DOTALL).group(1) 
+            fragment = fragment.replace('src="../i/', 'src="/folk/i/')
+            fragment = fragment.replace("src='../i/", "src='/folk/i/")
+            fragment = re.sub(r'<h.*>[^<]*</h.>', '', fragment)
+            # replace src="../i/ with src="/folk/i
+            person.blurb = fragment
+        else:
+            message = f"! Blurb parse error in {ms_filename}"
+            print(message)
+            DataIssue.objects.create(parser='people', message=message, url="/folk/")
+
+    elif ms_filename == '':
         pass
-    elif mugShotPath[-3:]=='htm': #if person has an html page, find the image(s) and add it. Also, add the text from the html page to the "blurb" field in his model instance.
-        personPageOld=open(mugShotPath,'r').read()
-        if not person.blurb:
-            pblurb=re.search('<body>.*<hr',personPageOld,re.DOTALL)
-            if pblurb:
-                #this needs to be refined, take care of the HTML and make sure it doesn't match beyond the blurb.
-                #Only finds the first image, not all of them
-                person.blurb=re.search('<body>.*<hr',personPageOld,re.DOTALL).group() 
-            else:
-                print("ERROR: --------------- Broken link or Blurb parse error in ", mugShotFilename)
-            #for mugShotFilename in re.findall('i/.*?jpg',personPageOld,re.DOTALL):
-            #    mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)
-            #    saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person)
+    else:
+        message = f"! Unrecognised type of file at mug_shot field '{ms_filename}' for {person.fullname}"
+        print(message)
+        DataIssue.objects.create(parser='people', message=message, url="/folk/")
+
     person.save()
 
-def LoadPersonsExpos():
+def load_people_expos():
+    '''This is where the folk.csv file is parsed to read people's names,
+    which it gets wrong for people like Lydia-Clare Leather, various 'von' and 'de' middle 'names',
+    and McLean, Mclean and McAdam; there is an interaction with the url parser in urls.py too.
+    '''
+    DataIssue.objects.filter(parser='people').delete()
     
-    persontab = open(os.path.join(settings.EXPOWEB, "folk", "folk.csv"))
-    personreader = csv.reader(persontab)
+    persontab = open(os.path.join(settings.EXPOWEB, "folk", "folk.csv")) # should really be EXPOFOLK I guess
+    personreader = csv.reader(persontab) # this is an iterator
     headers = next(personreader)
     header = dict(list(zip(headers, list(range(len(headers))))))
     
@@ -86,7 +115,7 @@ def LoadPersonsExpos():
         nonLookupAttribs={'is_vfho':vfho, 'fullname':fullname}
         person, created = save_carefully(Person, lookupAttribs, nonLookupAttribs)
 
-        parseMugShotAndBlurb(personline=personline, header=header, person=person)
+        parse_blurb(personline=personline, header=header, person=person)
     
         # make person expedition from table
         for year, attended in list(zip(headers, personline))[5:]:
author	Philip Sargent <philip.sargent@klebos.com>	2021-04-15 17:51:01 +0100
committer	Philip Sargent <philip.sargent@klebos.com>	2021-04-15 17:51:01 +0100
commit	27491c933a3b676960179448c7c28ba1b788e3e7 (patch)
tree	61f90f2b20482f17ddaded9b6a7e335be1f0b7be /parsers/people.py
parent	7124d978d31b839efc57112a9aa2c6d82c2d60d7 (diff)
download	troggle-27491c933a3b676960179448c7c28ba1b788e3e7.tar.gz troggle-27491c933a3b676960179448c7c28ba1b788e3e7.tar.bz2 troggle-27491c933a3b676960179448c7c28ba1b788e3e7.zip