import troggle.core.models as models #import models for various objects
from django.conf import settings
import xml.etree.ElementTree as ET #used to parse XML files
import subprocess
import re
#
# This parser has to find several things:
# There are files of .html format in the expoweb area - they contain some of the important information
# There is a similar number of .svx files in the loser area - they contain all the measurements
#
# The previous version was incredibly slow due to various poor approaches to finding things
# and over-reliance on Python when handling regular expressions; the new version delegates the heavy lifting to the shell
# and only handles the more sophisticated bits itself
#
def load():
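    """Entry point: currently loads only the caves of the 1623 area."""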
    print('Hi! I\'m the caves parser. Ready to work.')
print('Loading caves of 1623 area')
loadarea('1623')
def loadarea(areacode):
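    """Imports one survex area (e.g. '1623').
    Runs cavern on the master '1623-and-1626.svx' file if no master .3d exists, dumps it with dump3d,
    then walks every cave directory under caves-<areacode>/, running cavern on each cave's .svx file
    to obtain its length and depth, derives an entrance location and a last survey date from the dump3d
    records, and saves a CaveM object per cave. Finally the .html cave descriptions are parsed and
    stored as Cave_descriptionM objects, linked to the matching CaveM where exactly one match exists."""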
if not file_exists(settings.SURVEX_DATA+'1623-and-1626.3d'):
print('Computing master .3d file')
bash('cavern -o'+settings.SURVEX_DATA+' '+settings.SURVEX_DATA+'1623-and-1626.svx')
else:
print('Loading from existing master .3d file')
master3d = bash('dump3d -d '+settings.SURVEX_DATA+'1623-and-1626.3d').splitlines()
master3dN = [x for x in master3d if ('NODE' in x)] #list of nodes of master survex file
    master3dL = [x for x in master3d if ('LINE' in x)] #list of legs (LINE records) of master survex file
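    # Assumed dump3d -d record layout: 'NODE <x> <y> <z> [<station.name>] <FLAGS...>' and
    # 'LINE <coords...> [<survey.name>] <FLAGS...> <date>', i.e. token 3 of a NODE is its altitude,
    # tokens 1-2 its easting/northing, and the last token of a LINE is normally its survey date.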
    print('Searching all cave directories')
basedir = settings.SURVEX_DATA+'caves-'+areacode+'/'
cavedirs = bash("find "+basedir+" -maxdepth 1 -type d").splitlines() #this command finds all directories
print('Obtained list of directories! (#dirs='+str(len(cavedirs))+')')
ndirs = len(cavedirs) #remember number of dirs for nice debug output
for cavedir in cavedirs:
if cavedir==basedir:
continue #skip the basedir - a non-proper subdirectory
cavename = bash('echo '+cavedir+' | rev | cut -f1 -d \'/\' | rev').splitlines()[0] #get final bit of the directory
        if not file_exists(cavedir+'/'+cavename+'.svx'): #test for file existence
msg = models.Parser_messageM(parsername='caves',content=cavedir+'/'+cavename+' MISSING!',message_type='warn')
            print('Cave missing: '+cavename+' :(')
msg.save()
continue
fullname=cavedir+'/'+cavename+'.svx'
print('Found cave:'+cavename)
cavernout = bash('cavern -o '+cavedir+' '+fullname) #make cavern process the thing
if 'cavern: error:' in cavernout:
msg = models.Parser_messageM(parsername='caves',content=cavedir+'/'+cavename+' Survex file messed up!',message_type='warn')
            print('Broken svx: '+cavename+' :(')
msg.save()
continue
cavernout = cavernout.splitlines()
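        # depth and length are scraped from cavern's summary lines ('Total vertical length ...' and
        # 'Total length of survey legs = ...', assumed format); the trailing slices strip the unit suffix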
depth = float(([x for x in cavernout if ('Total vertical length' in x)][0].split()[-1])[:-2])
length = float(([x for x in cavernout if ('Total length' in x)][0].split()[6])[:-1])
cavefile = open(fullname,'r')
cavefilecontents = cavefile.read().splitlines()
surveyname = [x for x in cavefilecontents if ('*begin ') in x][0].split()[1].lower()
        try:
            title = [x for x in cavefilecontents if ('*title ') in x][0].split()[1]
        except:
            title = "Untitled" #fall back when the .svx file has no *title line
relevant_nodes = [x for x in master3dN if (('['+areacode+'.'+surveyname+'.' in x) or ('['+areacode+'.'+surveyname+']' in x))]
entrance_nodes = [x for x in relevant_nodes if 'ENTRANCE' in x]
surface_nodes = [x for x in relevant_nodes if 'SURFACE' in x]
location_nodes = []
print('rel_nodes'+str(len(relevant_nodes)))
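        # prefer stations flagged ENTRANCE, then SURFACE stations, then any station of this cave;
        # the highest of those (by altitude, token 3 of the NODE record) is used as the entrance location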
if len(entrance_nodes) > 0:
location_nodes = entrance_nodes
elif len(surface_nodes) > 0:
location_nodes = surface_nodes
elif len(relevant_nodes) > 0:
location_nodes = relevant_nodes
try:
location = sorted(location_nodes, key = lambda y : float(y.split()[3])).pop()
except:
print(location_nodes)
location = 'Not found'
relevant_lines = [x for x in master3dL if (('['+areacode+'.'+surveyname+'.' in x) or ('['+areacode+'.'+surveyname+']' in x))]
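        # LINE records are sorted by their last token (the survey date when dump3d is run with -d),
        # so the final element is the most recently surveyed leg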
try:
lastleg = sorted(relevant_lines, key = lambda y : y.split().pop()).pop()
except:
            lastleg = 'LINE 1900.01.01' #plain string so the .split() below still works when no legs were found
try:
lastdate = lastleg.split().pop()
if 'STYLE' in lastdate:
                lastdate = lastleg.split()[-2] #the date precedes the trailing STYLE token on these records
except:
lastdate = '1900.01.01'
entrance = ' '.join(location.split()[1:3])
print((('depth','length','surv name','entr','date'),(depth,length,surveyname,entrance,lastdate))) #sanity check print
newcave = models.CaveM(
survex_file = fullname,
total_length = length,
name=areacode+'.'+surveyname,
total_depth = depth,
date = lastdate,
entrance = entrance)
newcave.save()
#end of reading survex masterfiles
print ("Reading cave descriptions")
cavefiles = bash('find '+settings.CAVEDESCRIPTIONS+' -name \'*.html\'').splitlines()
for fn in cavefiles:
f = open(fn, "r")
print(fn)
contents = f.read()
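        # the caveslug is expected to look like '1623-115': the first four characters give the area,
        # everything after the '-' gives the survex-compatible cave name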
slug = re.sub(r"\s+", "", extractXML(contents,'caveslug'))
desc = extractXML(contents,'underground_description')
name = slug[5:] #get survex compatible name
area = slug[0:4]
print([area,name])
        if desc is None or name is None:
            msg = models.Parser_messageM(parsername='caves',content=fn+' Description messed up!',message_type='warn')
            print('Broken description '+fn+' :(')
msg.save()
continue
print(area+'/'+name+'/'+name+'.svx')
updatecave = models.CaveM.objects.filter(survex_file__icontains=area+'/'+name+'/'+name+'.svx')
if len(updatecave)>1:
print('Non unique solution - skipping. Name:'+name)
elif len(updatecave)==0:
print('Cave with no survex data:'+name)
continue
        else: #exactly one match
print('Adding desc:'+name)
updatecave = updatecave[0]
updatecave.description = '/cave/descriptionM/'+slug #area-name
updatecave.title=name
updatecave.save()
slugS = slug
explorersS = extractXML(contents,'explorers')
underground_descriptionS = extractXML(contents,'underground_description')
equipmentS = extractXML(contents,'equipment')
referencesS = extractXML(contents,'references')
surveyS = extractXML(contents,'survey')
kataster_statusS = extractXML(contents,'kataster_status')
underground_centre_lineS = extractXML(contents,'underground_centre_line')
survex_fileS = extractXML(contents,'survex_file')
notesS = extractXML(contents,'notes')
newcavedesc = models.Cave_descriptionM(
slug = slugS,
explorers = explorersS,
underground_description = underground_descriptionS,
equipment = equipmentS,
references = referencesS,
survey = surveyS,
kataster_status = kataster_statusS,
underground_centre_line = underground_centre_lineS,
survex_file = survex_fileS,
notes = notesS)
newcavedesc.save()
#end of reading cave descriptions
def file_exists(filename):
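    """Returns True if filename exists as a regular file (checked via the shell), False otherwise."""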
    test = bash('if [ ! -f '+filename+' ] ; then echo MISSING; fi') #test for file existence
if 'MISSING' in test: #send error message to the database
return False
return True
def extractXML(contents,tag):
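    """Returns the text enclosed by <tag>...</tag> in an .html description file, or None if either tag is missing."""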
#find correct lines
lines = contents.splitlines()
beg = [x for x in lines if ('<'+tag+'>' in x)]
end = [x for x in lines if ('</'+tag+'>' in x)]
if (not beg) or (not end):
return None
begi = lines.index(beg[0])
endi = lines.index(end[0])
if endi!=begi:
segment = '\n'.join(lines[begi:endi+1])
else:
segment = lines[begi:endi+1][0]
hit = re.findall('<'+tag+'>(.*)</'+tag+'>', segment, re.S)[0]
return hit
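# e.g. extractXML('<notes>dig at the far end</notes>', 'notes') returns 'dig at the far end'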
def bash(cmd): #calls a command in a bash shell, returns its stdout as a string
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    output, error = process.communicate()
    if isinstance(output, bytes): #under Python 3 subprocess returns bytes; decode so callers can do string matching
        output = output.decode('utf-8', errors='replace')
    return output