utils.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164

from django.conf import settings
from django.shortcuts import render
import random, re, logging
from troggle.core.models import CaveDescription

def weighted_choice(lst):
    n = random.uniform(0,1)
    for item, weight in lst:
        if n < weight:
            break
        n = n - weight
    return item

def randomLogbookSentence():
    from troggle.core.models import LogbookEntry
    randSent={}

    # needs to handle empty logbooks without crashing

    #Choose a random logbook entry
    randSent['entry']=LogbookEntry.objects.order_by('?')[0]

    #Choose again if there are no sentances (this happens if it is a placeholder entry)
    while len(re.findall('[A-Z].*?\.',randSent['entry'].text))==0:
        randSent['entry']=LogbookEntry.objects.order_by('?')[0]
    
    #Choose a random sentence from that entry. Store the sentence as randSent['sentence'], and the number of that sentence in the entry as randSent['number']
    sentenceList=re.findall('[A-Z].*?\.',randSent['entry'].text)
    randSent['number']=random.randrange(0,len(sentenceList))
    randSent['sentence']=sentenceList[randSent['number']]
           
    return randSent


def save_carefully(objectType, lookupAttribs={}, nonLookupAttribs={}):
    """Looks up instance using lookupAttribs and carries out the following:
            -if instance does not exist in DB: add instance to DB, return (new instance, True)
            -if instance exists in DB and was modified using Troggle: do nothing, return (existing instance, False)
            -if instance exists in DB and was not modified using Troggle: overwrite instance, return (instance, False)
            
        The checking is accomplished using Django's get_or_create and the new_since_parsing boolean field
        defined in core.models.TroggleModel.
    
    """

    instance, created=objectType.objects.get_or_create(defaults=nonLookupAttribs, **lookupAttribs)

    if not created and not instance.new_since_parsing:
        for k, v in list(nonLookupAttribs.items()): #overwrite the existing attributes from the logbook text (except date and title)
            setattr(instance, k, v)
        instance.save()
    
    if created:
        logging.info(str(instance) + ' was just added to the database for the first time. \n')
    
    if not created and instance.new_since_parsing:
        logging.info(str(instance) + " has been modified using Troggle, so the current script left it as is. \n")

    if not created and not instance.new_since_parsing:
        logging.info(str(instance) + " existed in the database unchanged since last parse. It was overwritten by the current script. \n")
    return (instance, created)
    
re_body = re.compile(r"\<body[^>]*\>(.*)\</body\>", re.DOTALL)
re_title = re.compile(r"\<title[^>]*\>(.*)\</title\>", re.DOTALL)

def get_html_body(text):
    return get_single_match(re_body, text)

def get_html_title(text):
    return get_single_match(re_title, text)

def get_single_match(regex, text):
    match = regex.search(text)

    if match:
        return match.groups()[0]
    else:
        return None

def href_to_wikilinks(matchobj):
    """
    Given an html link, checks for possible valid wikilinks.
    
    Returns the first valid wikilink. Valid means the target
    object actually exists.
    """
    res=CaveDescription.objects.filter(long_name__icontains=matchobj.groupdict()['text'])
    if res and res[0]:
        return r'[[cavedescription:'+res[0].short_name+'|'+res[0].long_name+']]'
    else:
        return matchobj.group()
    #except:
        #print 'fail'
    

re_subs = [(re.compile(r"\<b[^>]*\>(.*?)\</b\>", re.DOTALL), r"'''\1'''"),
           (re.compile(r"\<i\>(.*?)\</i\>", re.DOTALL), r"''\1''"),
           (re.compile(r"\<h1[^>]*\>(.*?)\</h1\>", re.DOTALL), r"=\1="),
           (re.compile(r"\<h2[^>]*\>(.*?)\</h2\>", re.DOTALL), r"==\1=="),
           (re.compile(r"\<h3[^>]*\>(.*?)\</h3\>", re.DOTALL), r"===\1==="),
           (re.compile(r"\<h4[^>]*\>(.*?)\</h4\>", re.DOTALL), r"====\1===="),
           (re.compile(r"\<h5[^>]*\>(.*?)\</h5\>", re.DOTALL), r"=====\1====="),
           (re.compile(r"\<h6[^>]*\>(.*?)\</h6\>", re.DOTALL), r"======\1======"),
           (re.compile(r'(<a href="?(?P<target>.*)"?>)?<img class="?(?P<class>\w*)"? src="?t/?(?P<source>[\w/\.]*)"?(?P<rest>></img>|\s/>(</a>)?)', re.DOTALL),r'[[display:\g<class> photo:\g<source>]]'), #
           (re.compile(r"\<a\s+id=['\"]([^'\"]*)['\"]\s*\>(.*?)\</a\>", re.DOTALL), r"[[subcave:\1|\2]]"), #assumes that all links with id attributes are subcaves. Not great.
           #interpage link needed
           (re.compile(r"\<a\s+href=['\"]#([^'\"]*)['\"]\s*\>(.*?)\</a\>", re.DOTALL), r"[[cavedescription:\1|\2]]"), #assumes that all links with target ids are cave descriptions. Not great.
           (re.compile(r"\[\<a\s+href=['\"][^'\"]*['\"]\s+id=['\"][^'\"]*['\"]\s*\>([^\s]*).*?\</a\>\]", re.DOTALL), r"[[qm:\1]]"),
           (re.compile(r'<a\shref="?(?P<target>.*)"?>(?P<text>.*)</a>'),href_to_wikilinks),
           
           ]

def html_to_wiki(text, codec = "utf-8"):
    if type(text) == str:
        text = str(text, codec)
    text = re.sub("</p>", r"", text)
    text = re.sub("<p>$", r"", text)
    text = re.sub("<p>", r"\n\n", text)
    out = ""
    lists = ""
    #lists
    while text:
        mstar = re.match("^(.*?)<ul[^>]*>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
        munstar = re.match("^(\s*)</ul>(.*)$", text, re.DOTALL)
        mhash = re.match("^(.*?)<ol[^>]*>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
        munhash = re.match("^(\s*)</ol>(.*)$", text, re.DOTALL)
        mitem = re.match("^(\s*)<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
        ms = [len(m.groups()[0]) for m in [mstar, munstar, mhash, munhash, mitem] if m]
        def min_(i, l):
            try:
                v = i.groups()[0]
                l.remove(len(v))
                return len(v) < min(l, 1000000000)
            except:
                return False
        if min_(mstar, ms):
            lists += "*"
            pre, val, post = mstar.groups()
            out += pre + "\n" + lists + " " + val
            text = post
        elif min_(mhash, ms):
            lists += "#"
            pre, val, post = mhash.groups()
            out += pre + "\n" + lists + " " + val
            text = post
        elif min_(mitem, ms):
            pre, val, post = mitem.groups()
            out += "\n" + lists + " " + val
            text = post
        elif min_(munstar, ms):
            lists = lists[:-1]
            text = munstar.groups()[1]
        elif min_(munhash, ms):
            lists.pop()
            text = munhash.groups()[1]
        else:
            out += text
            text = ""
    #substitutions
    for regex, repl in re_subs:
        out = regex.sub(repl, out)
    return out