diff options
Diffstat (limited to 'parsers/people.py')
-rw-r--r-- | parsers/people.py | 52 |
1 files changed, 37 insertions, 15 deletions
diff --git a/parsers/people.py b/parsers/people.py index 4799ebf..c0ffa10 100644 --- a/parsers/people.py +++ b/parsers/people.py @@ -3,9 +3,9 @@ import os import re from html import unescape from pathlib import Path +from unidecode import unidecode from django.conf import settings -from unidecode import unidecode from troggle.core.models.troggle import DataIssue, Expedition, Person, PersonExpedition @@ -17,7 +17,9 @@ or they should use the same code by importing a module. def parse_blurb(personline, header, person): - """create mugshot Photo instance""" + """create mugshot Photo instance + Would be better if all this was done before the Person object was created in the db, then it would not + need re-saving (which is slow)""" ms_filename = personline[header["Mugshot"]] ms_path = Path(settings.EXPOWEB, "folk", ms_filename) @@ -60,7 +62,19 @@ def parse_blurb(personline, header, person): person.save() - +slug_cache = {} +def troggle_slugify(longname): + """Uniqueness enforcement too. Yes we have had two "Dave Johnson"s + """ + slug = longname.strip().lower().replace(" ","-") + if len(slug) > 40: # slugfield is 50 chars + slug = slug[:40] + if slug in slug_cache: + slug_cache[slug] += 1 + slug = f"{slug}_{slug_cache[slug]}" + slug_cache[slug] = 1 + return slug + def load_people_expos(): """This is where the folk.csv file is parsed to read people's names. Which it gets wrong for people like Lydia-Clare Leather and various 'von' and 'de' middle 'names' @@ -86,8 +100,11 @@ def load_people_expos(): for personline in personreader: name = personline[header["Name"]] - name = re.sub(r"<.*?>", "", name) - slug = slugify(name) + name = re.sub(r"<.*?>", "", name) + + match = re.match(r"^([^(]*)(\(([^)]*)\))?", name) # removes nickname in brackets + displayname = match.group(1) + slug = troggle_slugify(displayname) firstname = "" nick = "" @@ -97,34 +114,39 @@ def load_people_expos(): lastname = matchlastname.group(1).strip() splitnick = re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?", name) - fullname = splitnick.group(1) - - nick = splitnick.group(2) or "" + fullname = splitnick.group(1) # removes Nickname in brackets, but also cuts hyphenated names + nick = splitnick.group(2) or "" fullname = fullname.strip() - names = fullname.split(" ") + + names = fullname.split(" ") # This may have more than one, e.g. "Adeleide de Diesback" firstname = names[0] if len(names) == 1: - lastname = "" + lastname = "" # wookey special code + + #restore fullname to be the whole string + fullname = displayname if personline[header["VfHO member"]] == "": vfho = False else: vfho = True - coUniqueAttribs = {"first_name": firstname, "last_name": (lastname or "")} - otherAttribs = {"is_vfho": vfho, "fullname": fullname, "nickname": nick} + # would be better to just create the python object, and only cmmit to db once all done inc blurb + # and better to save all the Persons in a bulk update, then do all the PersonExpeditions + coUniqueAttribs = {"slug": slug} + otherAttribs = {"first_name": firstname, "last_name": (lastname or ""), "is_vfho": vfho, "fullname": fullname, "nickname": nick,"is_guest": (personline[header["Guest"]] == "1")} person = Person.objects.create(**otherAttribs, **coUniqueAttribs) - parse_blurb(personline=personline, header=header, person=person) + parse_blurb(personline=personline, header=header, person=person) # saves to db too # make person expedition from table for year, attended in list(zip(headers, personline))[5:]: expedition = Expedition.objects.get(year=year) if attended == "1" or attended == "-1": coUniqueAttribs = {"person": person, "expedition": expedition} - otherAttribs = {"is_guest": (personline[header["Guest"]] == "1")} - pe = PersonExpedition.objects.create(**otherAttribs, **coUniqueAttribs) + # otherAttribs = {"is_guest": (personline[header["Guest"]] == "1")} + pe = PersonExpedition.objects.create(**coUniqueAttribs) print("", flush=True) |