summary | refs | log | tree | commit | diff | stats
path: root/parsers
diff options
context:
space:
mode:
Diffstat (limited to 'parsers')
-rw-r--r-- parsers/logbooks.py 8
-rw-r--r-- parsers/people.py 11
2 files changed, 13 insertions, 6 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 12124ca..e5817a6 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -25,9 +25,10 @@ from utils import save_carefully
#
# the logbook loading section
#
-def GetTripPersons(trippeople, expedition, logtime_underground):
+def GetTripPersons(trippeople, expedition, logtime_underground):
res = [ ]
author = None
+ round_bracket_regex = re.compile(r"[\(\[].*?[\)\]]")
for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople):
tripperson = tripperson.strip()
mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson)
@@ -35,6 +36,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground):
tripperson = mul.group(1).strip()
if tripperson and tripperson[0] != '*':
#assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
+ tripperson = re.sub(round_bracket_regex, "", tripperson).strip()
personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
if not personyear:
print(" - No name match for: '%s'" % tripperson)
@@ -172,8 +174,8 @@ def Parseloghtmltxt(year, expedition, txt):
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
ldate = ParseDate(tripdate.strip(), year)
#assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
- #trippeople = re.sub(r"Ol(?!l)", "Olly", trippeople)
- #trippeople = re.sub(r"Wook(?!e)", "Wookey", trippeople)
+ #trippeople = re.sub(r"Ol(?!l)", "Olly", trippeople)
+ #trippeople = re.sub(r"Wook(?!e)", "Wookey", trippeople)
triptitles = triptitle.split(" - ")
if len(triptitles) >= 2:
tripcave = triptitles[0]
diff --git a/parsers/people.py b/parsers/people.py
index eb877f2..34a5ff3 100644
--- a/parsers/people.py
+++ b/parsers/people.py
@@ -4,6 +4,8 @@ from django.conf import settings
import troggle.core.models as models
import csv, re, datetime, os, shutil
from utils import save_carefully
+from HTMLParser import HTMLParser
+from unidecode import unidecode
def saveMugShot(mugShotPath, mugShotFilename, person):
if mugShotFilename.startswith(r'i/'): #if filename in cell has the directory attached (I think they all do), remove it
@@ -132,11 +134,12 @@ def GetPersonExpeditionNameLookup(expedition):
print("Calculating GetPersonExpeditionNameLookup for " + expedition.year)
personexpeditions = models.PersonExpedition.objects.filter(expedition=expedition)
+ htmlparser = HTMLParser()
for personexpedition in personexpeditions:
possnames = [ ]
- f = personexpedition.person.first_name.lower()
- l = personexpedition.person.last_name.lower()
- full = personexpedition.person.fullname.lower()
+ f = unidecode(htmlparser.unescape(personexpedition.person.first_name.lower()))
+ l = unidecode(htmlparser.unescape(personexpedition.person.last_name.lower()))
+ full = unidecode(htmlparser.unescape(personexpedition.person.fullname.lower()))
if l:
possnames.append(f + " " + l)
possnames.append(f + " " + l[0])
@@ -154,6 +157,8 @@ def GetPersonExpeditionNameLookup(expedition):
possnames.append(personexpedition.nickname.lower() + " " + l)
if str(personexpedition.nickname.lower() + " " + l[0]) not in possnames:
possnames.append(personexpedition.nickname.lower() + " " + l[0])
+ if str(personexpedition.nickname.lower() + l[0]) not in possnames:
+ possnames.append(personexpedition.nickname.lower() + l[0])
for possname in possnames:
if possname in res: