parsers/people.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296

import csv, re, datetime, os, shutil
from html import unescape
from unidecode import unidecode
from pathlib import Path

from django.conf import settings

from troggle.core.models.troggle import Expedition, Person, PersonExpedition
from troggle.core.models.troggle import DataIssue
from troggle.core.utils import save_carefully, TROG

'''These functions do not match how the stand-alone folk script works. So the script produces an HTML file which has
href links to pages in troggle which troggle does not think are right.
The standalone script needs to be rendered defunct, and all the parsing needs to be in troggle. Either that,
or they should use the same code by importing a module.
'''

def parse_blurb(personline, header, person):
    """Set a person's mugshot or blurb from the "Mugshot" column of one folk.csv row.

    personline -- one row of folk.csv, as a list of field strings
    header     -- dict mapping a column name to its index in personline
    person     -- the object to update; its mug_shot and blurb attributes are set

    The field is a path relative to EXPOWEB/folk and is handled by its prefix:
      'i/...'  an image, e.g. 'i/adama2018.jpg': stored in person.mug_shot, blurb cleared
      'l/...'  an HTML page, e.g. 'l/ollybetts.htm': the text between '<body>' and the
               last '<hr' becomes person.blurb, mug_shot is cleared
      ''       nothing to record
    Anything else is reported as a DataIssue. The person is saved at the end, except
    when the field names a file that does not exist: then a DataIssue is recorded and
    the function returns without saving.
    """
    mug_field = personline[header["Mugshot"]]
    target = Path(settings.EXPOWEB, "folk", mug_field)

    # Guard: a non-empty field must name a real file.
    if mug_field and not target.is_file():
        message = f"! INVALID mug_shot field '{mug_field}' for {person.fullname}"
        print(message)
        DataIssue.objects.create(parser='people', message=message, url=f"/person/{person.fullname}")
        return

    if mug_field == '':
        pass
    elif mug_field.startswith('i/'):
        # Just an image: point at it under the /folk url.
        person.mug_shot = str(Path("/folk", mug_field))
        person.blurb = None
    elif mug_field.startswith('l/'):
        # An HTML page, which may itself contain <img src="../i/mymug.jpg"> images.
        with open(target, 'r') as page:
            html = page.read()
        body = re.search(r'<body>(.*)<hr', html, re.DOTALL)
        if body is None:
            message = f"! Blurb parse error in {mug_field}"
            print(message)
            DataIssue.objects.create(parser='people', message=message, url="/folk/")
        else:
            person.mug_shot = None
            text = body.group(1)
            # Relative image links only work from inside /folk/l/, so make them absolute.
            text = text.replace('src="../i/', 'src="/folk/i/')
            text = text.replace("src='../i/", "src='/folk/i/")
            # Drop heading elements from the fragment.
            person.blurb = re.sub(r'<h.*>[^<]*</h.>', '', text)
    else:
        message = f"! Unrecognised type of file at mug_shot field '{mug_field}' for {person.fullname}"
        print(message)
        DataIssue.objects.create(parser='people', message=message, url="/folk/")

    person.save()

def load_people_expos():
    '''Parse folk.csv and create the Expedition, Person and PersonExpedition records.

    The first row of folk.csv gives the column names; every column from the sixth
    onward is treated as an expedition year. Each later row is one person: the
    "Name" column (HTML tags removed) gives the full name and an optional
    "(nickname)", "Lastname" gives the last name, "VfHO member" and "Guest" are
    flags, "Mugshot" is handed to parse_blurb(), and a "1" or "-1" in a year column
    records attendance on that expedition.

    All existing DataIssues for the 'people' parser are deleted first. Blank rows
    are skipped and a row whose name cannot be parsed is reported as a DataIssue
    and skipped.

    Known weakness: the name splitting gets it wrong for people like Lydia-Clare
    Leather and various 'von' and 'de' middle 'names' and McLean and Mclean and
    McAdam - interaction with the url parser in urls.py too.
    '''
    DataIssue.objects.filter(parser='people').delete()

    folkfile = os.path.join(settings.EXPOWEB, "folk", "folk.csv") # should really be EXPOFOLK I guess
    # A name, optionally followed by "(something)"; used for both Name and Lastname.
    name_pattern = re.compile(r"^([\w&;\s]+)(?:\(([^)]*)\))?")

    # The with-block closes the file even if a row raises; newline="" is what
    # the csv module asks for so that it can do its own line-ending handling.
    with open(folkfile, newline="") as persontab:
        personreader = csv.reader(persontab) # this is an iterator
        headers = next(personreader)
        header = {column: index for index, column in enumerate(headers)}

        # make expeditions
        print(" - Loading expeditions")
        years = headers[5:]
        for year in years:
            save_carefully(Expedition, {'year': year}, {'name': "CUCC expo %s" % year})
        # Fetch each expedition once, rather than once per person per year.
        expeditions = {year: Expedition.objects.get(year=year) for year in years}

        # make persons
        print(" - Loading personexpeditions")
        for personline in personreader:
            if not personline:
                continue # csv.reader yields an empty list for a blank line

            name = re.sub(r"<.*?>", "", personline[header["Name"]])
            splitnick = name_pattern.match(name)
            if splitnick is None:
                message = f"! Unparseable name '{name}' in folk.csv"
                print(message)
                DataIssue.objects.create(parser='people', message=message, url="/folk/")
                continue

            fullname = splitnick.group(1).strip()
            nickname = splitnick.group(2) or ""
            names = fullname.split(' ')
            firstname = names[0]

            # A single-word name has no last name, whatever the Lastname column says;
            # an empty or unparseable Lastname column is treated the same way.
            matchlastname = name_pattern.match(personline[header["Lastname"]].strip())
            if matchlastname is None or len(names) == 1:
                lastname = ""
            else:
                lastname = matchlastname.group(1).strip()

            vfho = personline[header["VfHO member"]] != ''

            lookupAttribs = {'first_name': firstname, 'last_name': lastname}
            nonLookupAttribs = {'is_vfho': vfho, 'fullname': fullname, 'nickname': nickname}
            person, _ = save_carefully(Person, lookupAttribs, nonLookupAttribs)

            parse_blurb(personline=personline, header=header, person=person)

            # make person expedition from table
            for year, attended in zip(years, personline[5:]):
                if attended in ("1", "-1"):
                    lookupAttribs = {'person': person, 'expedition': expeditions[year]}
                    nonLookupAttribs = {'nickname': nickname, 'is_guest': (personline[header["Guest"]] == "1")}
                    save_carefully(PersonExpedition, lookupAttribs, nonLookupAttribs)
    print("", flush=True)

def who_is_this(year, possibleid):
    '''Return the person known as possibleid on the expedition of the given year, or None.

    year       -- value matched against Expedition.year
    possibleid -- a name, nickname or abbreviation; matched case-insensitively against
                  the keys produced by GetPersonExpeditionNameLookup()

    Returns the .person of the matching PersonExpedition, or None when there is no
    expedition for that year or no unambiguous match for the name.
    '''
    # The lookup needs a single Expedition (it reads expedition.name), not a QuerySet.
    expo = Expedition.objects.filter(year=year).first()
    if expo is None:
        return None

    # .get() rather than [...] so that an unknown name gives None instead of KeyError.
    personexpedition = GetPersonExpeditionNameLookup(expo).get(possibleid.lower())
    if personexpedition is None:
        return None
    return personexpedition.person
        
def known_foreigner(id):
    '''Is this someone from ARGE or a known Austrian?

    Returns True only when id is exactly one of the names listed below:
    the comparison is case-sensitive and there is no soft matching.
    '''
    friends = (
        "P. Jeutter", "K. Jäger", "S. Steinberger", "R. Seebacher",
        "Dominik Jauch", "Fritz Mammel", "Marcus Scheuerman",
        "Uli Schütz", "Wieland Scheuerle",
        "Kai Schwekend", "Regina Kaiser", "Thilo Müller",
        "Florian Gruner", "Helmut Stopka-Ebeler", "Aiko", "Mark Morgan",
    )
    return id in friends

    
# Refactor. The dict GetPersonExpeditionNameLookup(expo) indexes by name and has values of personexpedition
# This is convoluted, the whole personexpedition concept is unnecessary?

# Lower-case first name -> other first names the same person may be written as.
_FIRST_NAME_ALIASES = {
    "robert": ("bob",),
    "rob": ("robert",),
    "andrew": ("andy",),
    "andy": ("andrew",),
    "michael": ("mike",),
    "david": ("dave",),
    "dave": ("david",),
    "peter": ("pete",),
    "pete": ("peter",),
    "olly": ("oliver",),
    "ollie": ("oliver",),
    "oliver": ("olly", "ollie"),
    "becka": ("rebecca",),
}

# Lower-case "first last" -> extra (first, last) pairs which get the full set of variations.
_PERSON_ALIAS_PAIRS = {
    "andy waddington": (("aer", "waddington"),),
    "phil underwood": (("phil", "underpants"),),
    "naomi griffiths": (("naomi", "makins"),),
    "tina white": (("tina", "richardson"),),
    "cat hulse": (("catherine", "hulse"), ("cat", "henry")),
    "jess stirrups": (("jessica", "stirrups"),),
    "nat dalton": (("nathanael", "dalton"),), # correct. He has a weird spelling.
}

# Lower-case "first last" -> extra names which are added exactly as written.
_PERSON_EXTRA_NAMES = {
    "mike richardson": ("mta", "miketa", "mike the animal", "animal"),
    "eric landgraf": ("eric c.landgraf", "eric c. landgraf", "eric c landgraf"),
    "nadia raeburn": ("nadia rc", "nadia raeburn-cherradi"),
}

# Cache of finished lookups, keyed by expedition.name. Nothing in this block ever clears it.
Gpersonexpeditionnamelookup = { }
def GetPersonExpeditionNameLookup(expedition):
    '''Return a dict mapping lower-case name strings to the PersonExpedition they identify.

    expedition -- the expedition whose PersonExpedition records are indexed; its .name
                  is the cache key.

    For every PersonExpedition of the expedition a generous set of possible names is
    built: the full name, the nickname, combinations of first name / nickname / known
    aliases with the last name and initials, plus a few hand-written special cases.
    Names are lower-cased, HTML-unescaped and passed through unidecode. A name
    generated for more than one PersonExpedition is ambiguous and is removed.

    In addition each first name contributes its 3 to 6 letter prefixes (e.g. Dan for
    Daniel). A prefix seen more than once is dropped; the surviving prefixes are
    written into the result last, so they overwrite any entry with the same key.

    The result is cached in Gpersonexpeditionnamelookup.
    '''
    def apply_variations(f, l):
        '''Be generous in guessing possible matches. Any duplicates will be ruled as invalid.
        '''
        f = f.lower()
        l = l.lower()
        if not f or not l:
            # Nothing to combine, and f[0] or l[0] would raise IndexError.
            return [part for part in (f, l) if part]
        return [
            f,
            l,
            f + l,
            f + " " + l,
            f + " " + l[0],
            f + l[0],
            f + " " + l[0] + '.',
            f[0] + " " + l,
            f[0] + ". " + l,
            f[0] + l,
            f[0] + l[0], # initials e.g. gb or bl
        ]

    res = Gpersonexpeditionnamelookup.get(expedition.name)
    # 'is not None' so that an empty lookup is also served from the cache.
    if res is not None:
        return res

    res = { }
    duplicates = set()
    short = {}
    dellist = []

    for personexpedition in PersonExpedition.objects.filter(expedition=expedition):
        person = personexpedition.person
        f = unidecode(unescape(person.first_name.lower()))
        l = unidecode(unescape(person.last_name.lower()))
        full = unidecode(unescape(person.fullname.lower()))
        n = unidecode(unescape(personexpedition.nickname.lower()))

        # A set, so repeats within one person do not count as duplicates.
        # Empty strings are left out: "" must never become a lookup key.
        possnames = {candidate for candidate in (full, n) if candidate}

        if l:
            possnames.update(apply_variations(f, l))
            if n:
                possnames.update(apply_variations(n, l))

            for alias in _FIRST_NAME_ALIASES.get(f, ()):
                possnames.update(apply_variations(alias, l))

            whole = f'{f} {l}'
            for alias_first, alias_last in _PERSON_ALIAS_PAIRS.get(whole, ()):
                possnames.update(apply_variations(alias_first, alias_last))
            possnames.update(_PERSON_EXTRA_NAMES.get(whole, ()))

        # short form, e.g. Dan for Daniel.
        # NOTE(review): a first name shorter than 6 letters yields the same slice more
        # than once, so it lands in dellist through its own repeats - confirm intended.
        for i in [3, 4, 5, 6]:
            prefix = f[:i]
            if prefix not in short:
                short[prefix] = personexpedition
            else:
                dellist.append(prefix)

        for possname in possnames:
            if possname in res:
                duplicates.add(possname)
            else:
                res[possname] = personexpedition

    for possname in duplicates:
        del res[possname]

    for possname in dellist:
        # dellist can hold the same prefix several times, so it may already be gone
        if possname in short:
            del short[possname]
    for shortname in short:
        res[shortname] = short[shortname]

    Gpersonexpeditionnamelookup[expedition.name] = res
    return res