From 3afb94f5d24bf86b432e16a40de08a5e5e20b0f0 Mon Sep 17 00:00:00 2001 From: martin speleo Date: Sat, 4 Jul 2009 16:42:17 +0100 Subject: [svn] Work on turn html pages into cavedescription models.py. Moved parser/cavetabs html_to_wiki function to utils.py Added databaseReset.py desc to refresh the cavedescriptions. --- parsers/cavetab.py | 80 +----------------------------------------------------- 1 file changed, 1 insertion(+), 79 deletions(-) (limited to 'parsers/cavetab.py') diff --git a/parsers/cavetab.py b/parsers/cavetab.py index 0c7b985..20c7658 100644 --- a/parsers/cavetab.py +++ b/parsers/cavetab.py @@ -3,6 +3,7 @@ import troggle.core.models as models from django.conf import settings import csv, time, re, os, logging from utils import save_carefully +from utils import html_to_wiki ##format of CAVETAB2.CSV is KatasterNumber = 0 @@ -52,85 +53,6 @@ MarkingComment = 43 Findability = 44 FindabilityComment = 45 - -def html_to_wiki(text): - if type(text) != str: - return text - text = unicode(text, "utf-8") - #Characters - #text = re.sub("ü", u"\xfc", text) - #text = re.sub("ö", u"\xf6", text) - #text = re.sub("ä", u"\xe4", text) - #text = re.sub("°", u"\xb0", text) - #text = re.sub("©", u"\xa9", text) - #text = re.sub("&", u"\x26", text) - #text = re.sub("ß", u"\xdf", text) - #text = re.sub("ß", u"\xdf", text) - #text = re.sub("<", u"<", text) - #text = re.sub(">", u">", text) - #text = re.sub("è", u"\xe8", text) - #text = re.sub("é", u"\xe9", text) - #text = re.sub(""e;", u'"', text) - #text = re.sub(""", u'"', text) - #text = re.sub("Ö", u'\xd6', text) - #text = re.sub("×", u'"', text) - - #text = re.sub("&(.*);", "/1", text) - #if s: - # print s.groups() - #Lists - text = re.sub("

", r"", text) - text = re.sub("

$", r"", text) - text = re.sub("

", r"\n\n", text) - out = "" - lists = "" - while text: - mstar = re.match("^(.*?)

(.*)$", text, re.DOTALL) - mhash = re.match("^(.*?)
    \s*]*>(.*?)(.*)$", text, re.DOTALL) - munhash = re.match("^(\s*)
(.*)$", text, re.DOTALL) - mitem = re.match("^(\s*)]*>(.*?)(.*)$", text, re.DOTALL) - ms = [len(m.groups()[0]) for m in [mstar, munstar, mhash, munhash, mitem] if m] - def min_(i, l): - try: - v = i.groups()[0] - l.remove(len(v)) - return len(v) < min(l, 1000000000) - except: - return False - if min_(mstar, ms): - lists += "*" - pre, val, post = mstar.groups() - out += pre + "\n" + lists + " " + val - text = post - elif min_(mhash, ms): - lists += "#" - pre, val, post = mhash.groups() - out += pre + "\n" + lists + " " + val - text = post - elif min_(mitem, ms): - pre, val, post = mitem.groups() - out += "\n" + lists + " " + val - text = post - elif min_(munstar, ms): - lists = lists[:-1] - text = munstar.groups()[1] - elif min_(munhash, ms): - lists.pop() - text = munhash.groups()[1] - else: - out += text - text = "" - text2 = out - while text2: - mtag = re.match("^(.*?)<(.*?)>(.*)$", text, re.DOTALL) - if mtag: - text2 = mtag.groups()[2] - print mtag.groups()[1] - else: - text2 = "" - return out - def LoadCaveTab(): cavetab = open(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'rU') caveReader = csv.reader(cavetab) -- cgit v1.2.3