path: root/parsers/cavesM.py
import os
import re
import subprocess

from django.conf import settings

import troggle.core.models as models # Django models for caves, parser messages etc.

#
#    This parser has to pull together several things:
#    - .html files in the expoweb area, which contain some of the important information
#    - a similar number of .svx files in the loser area, which contain all the measurements
#
#    The previous version was very slow because of how it searched for files and because it did
#    all the regular-expression work in Python; this version delegates the heavy lifting to the
#    shell and only handles the more sophisticated bits itself.
#

def load():
    print('Hi! I\'m the caves parser, ready to work.')

    print('Loading caves of the 1623 area')
    loadarea('caves-1623/')


def loadarea(areacode):

    print('Searching for cave directories')
    basedir = settings.SURVEX_DATA+areacode

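    # Run cavern over the combined dataset first, so that an up-to-date .3d file
    # exists for the dump3d calls further down.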
    bash('cavern -o'+settings.SURVEX_DATA+' '+settings.SURVEX_DATA+'1623-and-1626.svx')

    cavedirs = bash("find "+basedir+" -maxdepth 1 -type d").splitlines() # basedir itself plus its immediate subdirectories
    print('Obtained list of directories! (#dirs='+str(len(cavedirs))+')')

    for cavedir in cavedirs:
        if cavedir==basedir:
            continue # find also returns the base directory itself - skip it
        cavename = os.path.basename(cavedir) # final component of the directory path
        
        if not os.path.isfile(cavedir+'/'+cavename+'.svx'): # no matching survex file - record a warning in the database
            msg = models.Parser_messageM(parsername='caves',content=cavedir+'/'+cavename+' MISSING!',message_type='warn')
            print('Cave missing: '+cavename+' :(')
            msg.save()
            continue
        fullname = cavedir+'/'+cavename+'.svx'
        print('Found cave: '+cavename)
        cavernout = bash('cavern -q '+fullname) # run cavern quietly to get the summary statistics
        if 'cavern: error:' in cavernout:
            msg = models.Parser_messageM(parsername='caves',content=cavedir+'/'+cavename+' Survex file is broken!',message_type='warn')
            print('Broken svx file: '+cavename+' :(')
            msg.save()
            continue
        
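        # pull depth and length out of the 'Total vertical length' and 'Total length'
        # lines of cavern's summary output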
        cavernout = cavernout.splitlines()
        depth = float(([x for x in cavernout if ('Total vertical length' in x)][0].split()[-1])[:-2])
        length = float(([x for x in cavernout if ('Total length' in x)][0].split()[6])[:-1])
        surveyname = bash('cat '+fullname+' | grep \'\*begin\' | head -n1 | cut -f2 -d \' \' ').splitlines().pop() # survey name from the first *begin line
        title = (bash('cat '+fullname+' | grep \'\*title\' | head -n1 | cut -f2 -d \' \' ').splitlines() or ["Not found"])[0] # first word of the *title line, if present
        print((('depth','length','surv name'),(depth,length,surveyname)))
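        # filter dump3d's NODE output down to the stations that belong to this survey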
        dumpcmd = 'dump3d '+settings.SURVEX_DATA+'1623-and-1626.3d | grep NODE | grep \'\\[.*\\.'+surveyname+'.*\\]\''
        print(dumpcmd)
        nodes = bash(dumpcmd).splitlines()
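        # stations flagged as entrances (collected here but not stored on the cave yet)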
        entran = [x for x in nodes if ('ENTRANCE' in x) ]
        print(nodes)


        newcave = models.CaveM(survex_file=fullname, total_length=length, name=title, total_depth=depth)
        newcave.save()
    # end of reading the survex master files

    print("Reading cave descriptions")
    cavefiles = bash('find '+settings.CAVEDESCRIPTIONS+' -name \'*.html\'').splitlines()
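    # Each description file carries <caveslug> and <underground_description> tags;
    # the slug is used to attach the description to the cave created from the survex data above.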
    for fn in cavefiles:
        print(fn)
        with open(fn, "r") as f:
            contents = f.read()

        desc = extractXML(contents,'underground_description')
        slug = extractXML(contents,'caveslug')
        slugmatch = re.search(r'>.*<', slug) if slug else None
        name = slugmatch.group()[6:-1] if slugmatch else None # strip the surrounding '>…<' and the leading area prefix
        
        if desc is None or name is None:
            msg = models.Parser_messageM(parsername='caves',content=fn+' Description is messed up!',message_type='warn')
            print('Broken description: '+fn+' :(')
            msg.save()
            continue

        
        updatecave = models.CaveM.objects.filter(survex_file__icontains='/'+name+'.svx')
        if len(updatecave)>1:
            print('Non-unique match - skipping. Name: '+name)
        elif len(updatecave)==0:
            print('Cave with no survex data: '+name)
            newcave = models.CaveM(description=desc, name=name)
            newcave.save()
        else: # exactly one match
            updatecave = updatecave[0]
            updatecave.description = desc
            if updatecave.name=="Not found":
                updatecave.name=name
            updatecave.title=name
            updatecave.save()
        

    # end of reading cave descriptions
    
        

def extractXML(contents,tag):
    # return the block of lines from '<tag>' to '</tag>' inclusive, or None if either tag is missing
    lines = contents.splitlines()
    beg = [x for x in lines if ('<'+tag+'>' in x)]
    end = [x for x in lines if ('</'+tag+'>' in x)]
    if (not beg) or (not end):
        return None
    begi = lines.index(beg[0])
    endi = lines.index(end[0])
    return '\n'.join(lines[begi:endi+1])
    

def bash(cmd): # run a command through the shell and return its output as text
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    output, error = process.communicate()
    return output.decode('utf-8') # communicate() returns bytes; the callers expect str