summaryrefslogtreecommitdiffstats
path: root/parsers
diff options
context:
space:
mode:
authormartin speleo <martin.speleo@gmail.com>2009-07-04 16:42:17 +0100
committermartin speleo <martin.speleo@gmail.com>2009-07-04 16:42:17 +0100
commit3afb94f5d24bf86b432e16a40de08a5e5e20b0f0 (patch)
treed4c0ba254b673f11b977c3c380ea97ea4eb31c7b /parsers
parent29f084613dffbdfa005258218e9e38c0c4a44bbb (diff)
downloadtroggle-3afb94f5d24bf86b432e16a40de08a5e5e20b0f0.tar.gz
troggle-3afb94f5d24bf86b432e16a40de08a5e5e20b0f0.tar.bz2
troggle-3afb94f5d24bf86b432e16a40de08a5e5e20b0f0.zip
[svn] Work on turning html pages into cavedescription models.py.
Moved the parsers/cavetab.py html_to_wiki function to utils.py. Added databaseReset.py desc to refresh the cavedescriptions.
Diffstat (limited to 'parsers')
-rw-r--r--parsers/cavetab.py80
1 files changed, 1 insertions, 79 deletions
diff --git a/parsers/cavetab.py b/parsers/cavetab.py
index 0c7b985..20c7658 100644
--- a/parsers/cavetab.py
+++ b/parsers/cavetab.py
@@ -3,6 +3,7 @@ import troggle.core.models as models
from django.conf import settings
import csv, time, re, os, logging
from utils import save_carefully
+from utils import html_to_wiki
##format of CAVETAB2.CSV is
KatasterNumber = 0
@@ -52,85 +53,6 @@ MarkingComment = 43
Findability = 44
FindabilityComment = 45
-
-def html_to_wiki(text):
- if type(text) != str:
- return text
- text = unicode(text, "utf-8")
- #Characters
- #text = re.sub("&uuml;", u"\xfc", text)
- #text = re.sub("&ouml;", u"\xf6", text)
- #text = re.sub("&auml;", u"\xe4", text)
- #text = re.sub("&deg;", u"\xb0", text)
- #text = re.sub("&copy;", u"\xa9", text)
- #text = re.sub("&amp;", u"\x26", text)
- #text = re.sub("&szlig;", u"\xdf", text)
- #text = re.sub("&szlig;", u"\xdf", text)
- #text = re.sub("&lt;", u"<", text)
- #text = re.sub("&gt;", u">", text)
- #text = re.sub("&egrave;", u"\xe8", text)
- #text = re.sub("&eacute;", u"\xe9", text)
- #text = re.sub("&quote;", u'"', text)
- #text = re.sub("&quot;", u'"', text)
- #text = re.sub("&Ouml;", u'\xd6', text)
- #text = re.sub("&times;", u'"', text)
-
- #text = re.sub("&(.*);", "/1", text)
- #if s:
- # print s.groups()
- #Lists
- text = re.sub("</p>", r"", text)
- text = re.sub("<p>$", r"", text)
- text = re.sub("<p>", r"\n\n", text)
- out = ""
- lists = ""
- while text:
- mstar = re.match("^(.*?)<ul>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
- munstar = re.match("^(\s*)</ul>(.*)$", text, re.DOTALL)
- mhash = re.match("^(.*?)<ol>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
- munhash = re.match("^(\s*)</ol>(.*)$", text, re.DOTALL)
- mitem = re.match("^(\s*)<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
- ms = [len(m.groups()[0]) for m in [mstar, munstar, mhash, munhash, mitem] if m]
- def min_(i, l):
- try:
- v = i.groups()[0]
- l.remove(len(v))
- return len(v) < min(l, 1000000000)
- except:
- return False
- if min_(mstar, ms):
- lists += "*"
- pre, val, post = mstar.groups()
- out += pre + "\n" + lists + " " + val
- text = post
- elif min_(mhash, ms):
- lists += "#"
- pre, val, post = mhash.groups()
- out += pre + "\n" + lists + " " + val
- text = post
- elif min_(mitem, ms):
- pre, val, post = mitem.groups()
- out += "\n" + lists + " " + val
- text = post
- elif min_(munstar, ms):
- lists = lists[:-1]
- text = munstar.groups()[1]
- elif min_(munhash, ms):
- lists.pop()
- text = munhash.groups()[1]
- else:
- out += text
- text = ""
- text2 = out
- while text2:
- mtag = re.match("^(.*?)<(.*?)>(.*)$", text, re.DOTALL)
- if mtag:
- text2 = mtag.groups()[2]
- print mtag.groups()[1]
- else:
- text2 = ""
- return out
-
def LoadCaveTab():
cavetab = open(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'rU')
caveReader = csv.reader(cavetab)