summary refs log tree commit diff stats
path: root/parsers/logbooks.py
diff options
context:
space:
mode:
author Philip Sargent <philip.sargent@gmail.com> 2022-12-19 11:38:34 +0000
committer Philip Sargent <philip.sargent@gmail.com> 2022-12-19 11:38:34 +0000
commit 7e9bb737771bb031d7db7864a5267b75da8e08c0 (patch)
tree 0b0f83433cad20baa8ddcdebfebacd15bc424fbd /parsers/logbooks.py
parent 43a98b4421a2be343007bcf6722982cf8c86370f (diff)
download troggle-7e9bb737771bb031d7db7864a5267b75da8e08c0.tar.gz
troggle-7e9bb737771bb031d7db7864a5267b75da8e08c0.tar.bz2
troggle-7e9bb737771bb031d7db7864a5267b75da8e08c0.zip
Vital fix to stop parsing terminating too early
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- parsers/logbooks.py 21
1 files changed, 16 insertions, 5 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index aa4ec92..e37780c 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,8 +1,6 @@
import csv
import os
import re
-# import pickle
-# import shelve
import time
from random import randint
from datetime import datetime, date
@@ -19,9 +17,8 @@ from parsers.people import GetPersonExpeditionNameLookup
'''
Parses and imports logbooks in all their wonderful confusion
-
-# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
-# it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc)
+ See detailed explanation of the complete process:
+ https://expo.survex.com/handbook/computing/logbooks-parsing.html
'''
todo='''
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
@@ -292,6 +289,9 @@ def ParseDate(tripdate, year):
# 2002, 2004 - now
def parser_html(year, expedition, txt, seq=""):
+ '''This uses some of the more obscure capabilities of regular expressions,
+ see https://docs.python.org/3/library/re.html
+ '''
global logentries
global logdataissues
@@ -367,6 +367,9 @@ def parser_html(year, expedition, txt, seq=""):
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
def parser_html_01(year, expedition, txt, seq=""):
+ '''This uses some of the more obscure capabilities of regular expressions,
+ see https://docs.python.org/3/library/re.html
+ '''
global logentries
global logdataissues
errorcount = 0
@@ -488,6 +491,12 @@ def parser_html_01(year, expedition, txt, seq=""):
def parser_blog(year, expedition, txt, sq=""):
'''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
Note that the entries have dates and authors, but no titles.
+ See detailed explanation of the complete process:
+ https://expo.survex.com/handbook/computing/logbooks-parsing.html
+ https://expo.survex.com/handbook/computing/log-blog-parsing.html
+
+ This uses some of the more obscure capabilities of regular expressions,
+ see https://docs.python.org/3/library/re.html
'''
global logentries
global logdataissues
@@ -498,6 +507,7 @@ def parser_blog(year, expedition, txt, sq=""):
message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
print(message)
+ # (?= is a non-consuming match, see https://docs.python.org/3/library/re.html
tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
if not ( tripparas ) :
message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
@@ -554,6 +564,7 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = re.sub(r"height=\"\d+\"","",tripcontent)
tripcontent = re.sub(r"width: \d+px","",tripcontent)
tripcontent = re.sub(r"\n\n+","\n\n",tripcontent)
+ tripcontent = re.sub(r"<hr\s*>","",tripcontent)
tripcontent = f"\n\nBlog Author: {trippeople}" + tripcontent
entrytuple = (tripdate, location, tripname, tripcontent,