Vital fix to stop parsing terminating too early

author: Philip Sargent <philip.sargent@gmail.com> 2022-12-19 11:38:34 +0000
committer: Philip Sargent <philip.sargent@gmail.com> 2022-12-19 11:38:34 +0000
commit: 7e9bb737771bb031d7db7864a5267b75da8e08c0 (patch)
tree: 0b0f83433cad20baa8ddcdebfebacd15bc424fbd /parsers/logbooks.py
parent: 43a98b4421a2be343007bcf6722982cf8c86370f (diff)
download: troggle-7e9bb737771bb031d7db7864a5267b75da8e08c0.tar.gz
troggle-7e9bb737771bb031d7db7864a5267b75da8e08c0.tar.bz2
troggle-7e9bb737771bb031d7db7864a5267b75da8e08c0.zip
1 files changed, 16 insertions, 5 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index aa4ec92..e37780c 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,8 +1,6 @@
 import csv
 import os
 import re
-# import pickle
-# import shelve
 import time
 from random import randint
 from datetime import datetime, date
@@ -19,9 +17,8 @@ from parsers.people import GetPersonExpeditionNameLookup
 
 '''
 Parses and imports logbooks in all their wonderful confusion
-
-# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
-# it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc)
+   See detailed explanation of the complete process:
+    https://expo.survex.com/handbook/computing/logbooks-parsing.html
 '''
 todo='''
 - refactor everything with some urgency, esp. LoadLogbookForExpedition()
@@ -292,6 +289,9 @@ def ParseDate(tripdate, year):
         
 # 2002, 2004 - now
 def parser_html(year, expedition, txt, seq=""):
+    '''This uses some of the more obscure capabilities of regular expressions,
+    see https://docs.python.org/3/library/re.html
+    '''
     global logentries
     global logdataissues
 
@@ -367,6 +367,9 @@ def parser_html(year, expedition, txt, seq=""):
 # main parser for 1991 - 2001.  simpler because the data has been hacked so much to fit it
 # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
 def parser_html_01(year, expedition, txt, seq=""):
+    '''This uses some of the more obscure capabilities of regular expressions,
+    see https://docs.python.org/3/library/re.html
+    '''
     global logentries
     global logdataissues
     errorcount = 0
@@ -488,6 +491,12 @@ def parser_html_01(year, expedition, txt, seq=""):
 def parser_blog(year, expedition, txt, sq=""):
     '''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
     Note that the entries have dates and authors, but no titles.
+    See detailed explanation of the complete process:
+    https://expo.survex.com/handbook/computing/logbooks-parsing.html
+    https://expo.survex.com/handbook/computing/log-blog-parsing.html
+    
+    This uses some of the more obscure capabilities of regular expressions,
+    see https://docs.python.org/3/library/re.html
     '''
     global logentries
     global logdataissues
@@ -498,6 +507,7 @@ def parser_blog(year, expedition, txt, sq=""):
         message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
         print(message)
 
+    # (?= is a non-consuming match, see https://docs.python.org/3/library/re.html
     tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
     if not ( tripparas ) :
         message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
@@ -554,6 +564,7 @@ def parser_blog(year, expedition, txt, sq=""):
         tripcontent = re.sub(r"height=\"\d+\"","",tripcontent)
         tripcontent = re.sub(r"width: \d+px","",tripcontent)
         tripcontent = re.sub(r"\n\n+","\n\n",tripcontent)
+        tripcontent = re.sub(r"<hr\s*>","",tripcontent)
         tripcontent =  f"\n\nBlog Author: {trippeople}" + tripcontent
 
         entrytuple = (tripdate, location, tripname, tripcontent,
author	Philip Sargent <philip.sargent@gmail.com>	2022-12-19 11:38:34 +0000
committer	Philip Sargent <philip.sargent@gmail.com>	2022-12-19 11:38:34 +0000
commit	7e9bb737771bb031d7db7864a5267b75da8e08c0 (patch)
tree	0b0f83433cad20baa8ddcdebfebacd15bc424fbd /parsers/logbooks.py
parent	43a98b4421a2be343007bcf6722982cf8c86370f (diff)
download	troggle-7e9bb737771bb031d7db7864a5267b75da8e08c0.tar.gz troggle-7e9bb737771bb031d7db7864a5267b75da8e08c0.tar.bz2 troggle-7e9bb737771bb031d7db7864a5267b75da8e08c0.zip