diff options
author | martin speleo <martin.speleo@gmail.com> | 2009-07-04 16:42:17 +0100 |
---|---|---|
committer | martin speleo <martin.speleo@gmail.com> | 2009-07-04 16:42:17 +0100 |
commit | 3afb94f5d24bf86b432e16a40de08a5e5e20b0f0 (patch) | |
tree | d4c0ba254b673f11b977c3c380ea97ea4eb31c7b /parsers/cavetab.py | |
parent | 29f084613dffbdfa005258218e9e38c0c4a44bbb (diff) | |
download | troggle-3afb94f5d24bf86b432e16a40de08a5e5e20b0f0.tar.gz troggle-3afb94f5d24bf86b432e16a40de08a5e5e20b0f0.tar.bz2 troggle-3afb94f5d24bf86b432e16a40de08a5e5e20b0f0.zip |
[svn] Work on turning html pages into cavedescription models (models.py).
Moved the html_to_wiki function from parsers/cavetab.py to utils.py.
Added databaseReset.py desc to refresh the cavedescriptions.
Diffstat (limited to 'parsers/cavetab.py')
-rw-r--r-- | parsers/cavetab.py | 80 |
1 files changed, 1 insertions, 79 deletions
diff --git a/parsers/cavetab.py b/parsers/cavetab.py
index 0c7b985..20c7658 100644
--- a/parsers/cavetab.py
+++ b/parsers/cavetab.py
@@ -3,6 +3,7 @@ import troggle.core.models as models
from django.conf import settings
import csv, time, re, os, logging
from utils import save_carefully
+from utils import html_to_wiki
##format of CAVETAB2.CSV is
KatasterNumber = 0
@@ -52,85 +53,6 @@ MarkingComment = 43
Findability = 44
FindabilityComment = 45
-
-def html_to_wiki(text):
- if type(text) != str:
- return text
- text = unicode(text, "utf-8")
- #Characters
- #text = re.sub("&uuml;", u"\xfc", text)
- #text = re.sub("&ouml;", u"\xf6", text)
- #text = re.sub("&auml;", u"\xe4", text)
- #text = re.sub("&deg;", u"\xb0", text)
- #text = re.sub("&copy;", u"\xa9", text)
- #text = re.sub("&amp;", u"\x26", text)
- #text = re.sub("&szlig;", u"\xdf", text)
- #text = re.sub("&szlig;", u"\xdf", text)
- #text = re.sub("&lt;", u"<", text)
- #text = re.sub("&gt;", u">", text)
- #text = re.sub("&egrave;", u"\xe8", text)
- #text = re.sub("&eacute;", u"\xe9", text)
- #text = re.sub("&quote;", u'"', text)
- #text = re.sub("&quot;", u'"', text)
- #text = re.sub("&Ouml;", u'\xd6', text)
- #text = re.sub("&times;", u'"', text)
-
- #text = re.sub("&(.*);", "/1", text)
- #if s:
- # print s.groups()
- #Lists
- text = re.sub("</p>", r"", text)
- text = re.sub("<p>$", r"", text)
- text = re.sub("<p>", r"\n\n", text)
- out = ""
- lists = ""
- while text:
- mstar = re.match("^(.*?)<ul>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
- munstar = re.match("^(\s*)</ul>(.*)$", text, re.DOTALL)
- mhash = re.match("^(.*?)<ol>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
- munhash = re.match("^(\s*)</ol>(.*)$", text, re.DOTALL)
- mitem = re.match("^(\s*)<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
- ms = [len(m.groups()[0]) for m in [mstar, munstar, mhash, munhash, mitem] if m]
- def min_(i, l):
- try:
- v = i.groups()[0]
- l.remove(len(v))
- return len(v) < min(l, 1000000000)
- except:
- return False
- if min_(mstar, ms):
- lists += "*"
- pre, val, post = mstar.groups()
- out += pre + "\n" + lists + " " + val
- text = post
- elif min_(mhash, ms):
- lists += "#"
- pre, val, post = mhash.groups()
- out += pre + "\n" + lists + " " + val
- text = post
- elif min_(mitem, ms):
- pre, val, post = mitem.groups()
- out += "\n" + lists + " " + val
- text = post
- elif min_(munstar, ms):
- lists = lists[:-1]
- text = munstar.groups()[1]
- elif min_(munhash, ms):
- lists.pop()
- text = munhash.groups()[1]
- else:
- out += text
- text = ""
- text2 = out
- while text2:
- mtag = re.match("^(.*?)<(.*?)>(.*)$", text, re.DOTALL)
- if mtag:
- text2 = mtag.groups()[2]
- print mtag.groups()[1]
- else:
- text2 = ""
- return out
-
def LoadCaveTab():
cavetab = open(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'rU')
caveReader = csv.reader(cavetab)
|