parsers/people.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165

from django.conf import settings
import troggle.core.models as models
import csv, re, datetime, os, shutil
from utils import save_carefully
from html.parser import HTMLParser
from unidecode import unidecode

# def saveMugShot(mugShotPath, mugShotFilename, person):
    # if mugShotFilename.startswith(r'i/'): #if filename in cell has the directory attached (I think they all do), remove it
        # mugShotFilename=mugShotFilename[2:]
    # else:
        # mugShotFilename=mugShotFilename # just in case one doesn't
    
    # dummyObj=models.DPhoto(file=mugShotFilename)
    
    # #Put a copy of the file in the right place. mugShotObj.file.path is determined by the django filesystemstorage specified in models.py
    # if not os.path.exists(dummyObj.file.path):
        # shutil.copy(mugShotPath, dummyObj.file.path)
    
    # mugShotObj, created = save_carefully(
        # models.DPhoto,
        # lookupAttribs={'is_mugshot':True, 'file':mugShotFilename},
        # nonLookupAttribs={'caption':"Mugshot for "+person.first_name+" "+person.last_name}
        # )
    
    # if created:
        # mugShotObj.contains_person.add(person)
        # mugShotObj.save()	

def parseMugShotAndBlurb(personline, header, person):
    """Fill in ``person.blurb`` from the person's "Mugshot" cell, then save the person.

    The "Mugshot" cell of ``personline`` is a filename relative to the
    ``folk`` directory under ``settings.EXPOWEB``:

    * a name ending in ``jpg`` is a bare image; nothing is done with it
      because the code that created mugshot photos is disabled;
    * a name ending in ``htm`` is an HTML page.  If the person has no blurb
      yet, everything from ``<body>`` up to the last ``<hr`` in that page is
      stored, markup included, as the blurb.  A page that cannot be opened
      or has no such span is reported on stdout and the blurb is left alone.

    Args:
        personline: one row of cells (sequence of strings).
        header: mapping from column title to index into ``personline``.
        person: object with a ``blurb`` attribute and a ``save()`` method.

    ``person.save()`` is always called, whether or not the blurb changed.
    """
    mugShotFilename = personline[header["Mugshot"]]
    mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)
    # Only an html page needs any work, and only while the blurb is still
    # empty; a bare 'jpg' is deliberately ignored (see saveMugShot comments).
    if mugShotPath.endswith('htm') and not person.blurb:
        try:
            with open(mugShotPath, 'r') as personPage:
                personPageOld = personPage.read()
        except OSError:
            # Broken link: treat as an empty page so it is reported below.
            personPageOld = ""
        # this needs to be refined, take care of the HTML and make sure it
        # doesn't match beyond the blurb.
        pblurb = re.search('<body>.*<hr', personPageOld, re.DOTALL)
        if pblurb:
            person.blurb = pblurb.group()
        else:
            print("ERROR: --------------- Broken link or Blurb parse error in ", mugShotFilename)
    person.save()

def LoadPersonsExpos():
    """Load expeditions, people and their attendance from ``folk/folk.csv``.

    The CSV is read from the ``folk`` directory under ``settings.EXPOWEB``.
    Its first row is a header.  Columns from index 5 onwards are treated as
    expedition years; the columns "Name", "Lastname", "Guest", "VfHO member"
    and "Mugshot" are looked up by title.

    For every year column an ``Expedition`` is saved.  For every following
    row a ``Person`` is saved, ``parseMugShotAndBlurb`` is run for them, and
    a ``PersonExpedition`` is saved for each year whose cell is "1" or "-1".

    Blank rows and rows whose "Name" cell cannot be parsed are skipped (the
    latter with a message on stdout) instead of aborting the whole import.
    """
    firstYearColumn = 5  # columns before this describe the person, not a year
    # "Some Name (nick)": group 1 is the name, group 2 the optional nickname
    namePattern = re.compile(r"^([\w&;\s]+)(?:\(([^)]*)\))?")

    # newline="" is what the csv module asks for; read everything up front so
    # the file is closed before any database work starts.
    with open(os.path.join(settings.EXPOWEB, "folk", "folk.csv"), newline="") as persontab:
        personreader = csv.reader(persontab)
        headers = next(personreader)
        personlines = list(personreader)
    header = {title: index for index, title in enumerate(headers)}

    # make expeditions
    print(" - Loading expeditions")
    years = headers[firstYearColumn:]
    expeditions = {}
    for year in years:
        lookupAttribs = {'year': year}
        nonLookupAttribs = {'name': "CUCC expo %s" % year}
        # keep the instance so it need not be queried again for every person
        expeditions[year], _created = save_carefully(models.Expedition, lookupAttribs, nonLookupAttribs)

    # make persons
    print(" - Loading personexpeditions")

    for personline in personlines:
        if not personline:
            continue  # csv.reader yields an empty list for a blank line

        name = re.sub(r"<.*?>", "", personline[header["Name"]])
        splitnick = namePattern.match(name)
        if not splitnick:
            print("ERROR: --------------- Cannot parse person name in ", personline)
            continue
        fullname = splitnick.group(1).strip()
        nickname = splitnick.group(2) or ""

        names = fullname.split(' ')
        firstname = names[0]
        if len(names) == 1:
            # single-name person: no last name, whatever the Lastname cell says
            lastname = ""
        else:
            rawlastname = personline[header["Lastname"]].strip()
            matchlastname = namePattern.match(rawlastname)
            # an empty or unparseable Lastname cell means "no last name"
            lastname = matchlastname.group(1).strip() if matchlastname else ""

        vfho = personline[header["VfHO member"]] != ''

        lookupAttribs = {'first_name': firstname, 'last_name': lastname}
        nonLookupAttribs = {'is_vfho': vfho, 'fullname': fullname}
        person, _created = save_carefully(models.Person, lookupAttribs, nonLookupAttribs)

        parseMugShotAndBlurb(personline=personline, header=header, person=person)

        # make person expedition from table
        for year, attended in zip(years, personline[firstYearColumn:]):
            if attended == "1" or attended == "-1":
                lookupAttribs = {'person': person, 'expedition': expeditions[year]}
                nonLookupAttribs = {'nickname': nickname, 'is_guest': (personline[header["Guest"]] == "1")}
                save_carefully(models.PersonExpedition, lookupAttribs, nonLookupAttribs)


# used in other referencing parser functions
# expedition name lookup cached for speed (it's a very big list)
# Maps expedition.name -> {lower-case name variant: PersonExpedition}
Gpersonexpeditionnamelookup = { }


def _possibleNames(first, last, full, nickname):
    """Return the distinct lower-case names one person may be referred to by.

    ``first``, ``last`` and ``full`` are expected to be lower-cased already;
    ``nickname`` is lower-cased here and may be empty or None.  Order is
    preserved, repeats and empty strings are dropped, so a person can never
    collide with themselves when the lookup is built.
    """
    candidates = []
    if last:
        candidates.append(first + " " + last)
        candidates.append(first + " " + last[0])
        candidates.append(first + last[0])
        if first:
            candidates.append(first[0] + " " + last)
    candidates.append(first)
    candidates.append(full)

    nick = (nickname or "").lower()
    if nick:
        candidates.append(nick)
        if last:
            # This allows for nickname to be used for short name eg Phil
            # adding Phil Sargent to the list
            candidates.append(nick + " " + last)
            candidates.append(nick + " " + last[0])
            candidates.append(nick + last[0])

    return [name for name in dict.fromkeys(candidates) if name]


def GetPersonExpeditionNameLookup(expedition):
    """Return a dict from lower-case name variants to PersonExpedition objects.

    Only the people attached to ``expedition`` through
    ``models.PersonExpedition`` are included.  First name, last name and
    full name are HTML-unescaped, passed through ``unidecode`` and
    lower-cased before the variants (full name, first name, first name plus
    last initial, nickname forms, ...) are generated.  Any variant that would
    match more than one person on the expedition is left out altogether.

    A non-empty result is cached in ``Gpersonexpeditionnamelookup`` under
    ``expedition.name`` and returned directly on later calls.
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape replaces it
    from html import unescape

    res = Gpersonexpeditionnamelookup.get(expedition.name)
    if res:
        return res

    res = { }
    duplicates = set()

    #print("Calculating GetPersonExpeditionNameLookup for " + expedition.year)
    personexpeditions = models.PersonExpedition.objects.filter(expedition=expedition)
    for personexpedition in personexpeditions:
        person = personexpedition.person
        f = unidecode(unescape(person.first_name.lower()))
        l = unidecode(unescape(person.last_name.lower()))
        full = unidecode(unescape(person.fullname.lower()))

        for possname in _possibleNames(f, l, full, personexpedition.nickname):
            if possname in res:
                duplicates.add(possname)
            else:
                res[possname] = personexpedition

    # a variant shared by several people is ambiguous, so it identifies nobody
    for possname in duplicates:
        del res[possname]

    Gpersonexpeditionnamelookup[expedition.name] = res
    return res