summary | refs | log | tree | commit | diff | stats
path: root/parsers/logbooks.py
diff options
context:
space:
mode:
author Philip Sargent <philip.sargent@gmail.com> 2023-09-02 17:49:37 +0300
committer Philip Sargent <philip.sargent@gmail.com> 2023-09-02 17:49:37 +0300
commit 1a8bc17f806d88b06aeabc28c11a6da199216fe2 (patch)
tree f02f5038db4b02d78cf5fbf09bea91bb3a287966 /parsers/logbooks.py
parent c9729c046ccccfd5e858f1fe8fbf24619d156e30 (diff)
download troggle-1a8bc17f806d88b06aeabc28c11a6da199216fe2.tar.gz
troggle-1a8bc17f806d88b06aeabc28c11a6da199216fe2.tar.bz2
troggle-1a8bc17f806d88b06aeabc28c11a6da199216fe2.zip
Fixed parsers
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- parsers/logbooks.py 66
1 files changed, 41 insertions, 25 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index fdc68ad..ad92931 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -60,8 +60,8 @@ LOGBOOK_PARSER_SETTINGS = {
LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB
ENTRIES = {
- "2023": 81,
- "2022": 93,
+ "2023": 83,
+ "2022": 94,
"2019": 55,
"2018": 95,
"2017": 74,
@@ -127,7 +127,7 @@ def reset_trip_id(date):
suffix = alphabet_suffix(n)
tid = f"{date}{suffix}"
- # print(tid)
+ # print(already, n, tid)
return tid
rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$")
@@ -136,6 +136,7 @@ rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
res = []
author = None
+ guests = []
# print(f'# {tid}')
# print(f" - {tid} '{trippeople}' ")
@@ -154,11 +155,12 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
try:
personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
if not personyear:
- if known_foreigner(tripperson):
- message = f" ! - {expedition.year} Known foreigner: '{tripperson}' in entry {tid=}"
+ guests.append(nickname_used)
+ if known_foreigner(nickname_used):
+ message = f" ! - {expedition.year} Known foreigner: '{nickname_used}' in entry {tid=}"
print(message)
else:
- message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this year."
+ message = f" ! - {expedition.year} No name match for: '{nickname_used}' in entry {tid=} for this year."
print(message)
DataIssue.objects.create(parser="logbooks", message=message)
res.append((personyear, nickname_used, logtime_underground))
@@ -170,10 +172,9 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
raise
if author_u:
author = personyear
- else:
- # a person but with * prefix. Ignored everywhere.
- # print(f" ! - {expedition.year} * person : {tripperson}")
- pass
+ else: # *guest
+ guests.append(tripperson)
+ # print(f" ! - {expedition.year} * GUEST : {tripperson}")
if not author:
if not res:
@@ -181,7 +182,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
author = res[-1][0] # the previous valid person and a time of 0 hours
# print(f" - {tid} [{author.person}] '{res[0][0].person}'...")
- return res, author
+ return res, author, guests
def tidy_time_underground(logtime_underground):
# Nasty hack, must tidy this up..
@@ -202,7 +203,7 @@ def tidy_time_underground(logtime_underground):
def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
try:
- trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
+ trippersons, author, guests = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
# trippersons is a list of tuples (personyear, nickname_used, logtime_underground)
except:
message = f" ! - {expedition.year} Logentry: {title} - GetTripPersons FAIL to recognise nickname"
@@ -216,7 +217,7 @@ def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
DataIssue.objects.create(parser="logbooks", message=message)
print(message)
- return trippersons, author
+ return trippersons, author, guests
def tidy_trip_cave(place):
# GetCaveLookup() needs to work better. None of this data is *used* though?
@@ -251,16 +252,18 @@ def tidy_tid(tid, title):
tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
return tid
-def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid):
+def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, guests, expedition, logtime_underground, tid):
"""saves a single logbook entry and related personlogentry items
We could do a bulk update to save all the entries, but then we would need to do a query on
- each one to get the primary key to asign to the PersonLogEntries. So overall probably not much
+ each one to get the primary key to assign to the PersonLogEntries. So overall probably not much
faster ?
"""
+ other_people = ", ".join(guests) # join list members separated by comma
nonLookupAttribs = {
"place": place,
+ "other_people": other_people, # *Ol's Mum, foreigners..
"text": text,
"expedition": expedition,
"time_underground": logtime_underground,
@@ -324,6 +327,17 @@ def parser_date(tripdate, year):
def parser_html(year, expedition, txt, seq=""):
"""This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
+
+ e.g.
+ * is greedy
+ *? is non-greedy
+
+ (?x) flag means VERBOSE
+
+ (?: ) non-capturing parentheses
+
+ \s whitespace
+ \S NOT whitespace
You can't see it here, but a round-trip export-then-import will move
the endmatter up to the frontmatter. This made sense when translating
@@ -357,7 +371,7 @@ def parser_html(year, expedition, txt, seq=""):
for trippara in tripparas:
logbook_entry_count += 1
tid = set_trip_seq_id(year, logbook_entry_count)
- # print(f' - new tid:{tid} lbe count: {logbook_entry_count}')
+ # print(f' - new seq tid:{tid} lbe count: {logbook_entry_count}')
s = re.match(
r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
@@ -367,15 +381,17 @@ def parser_html(year, expedition, txt, seq=""):
\s*<div\s+class="triptitle">\s*(.*?)</div>
([\s\S]*?)
\s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
+ \s*(?:<div\s+class="editentry"\s*.*?</div>)?
\s*$
""",
trippara,
)
if s:
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
+ # print(f"#{logbook_entry_count} {tu} {len(triptext)} ")
else:
# if not re.search(r"Rigging Guide", trippara):
- msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'"
+ msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:75]}'..."
print(msg)
DataIssue.objects.create(parser="logbooks", message=msg)
continue
@@ -403,12 +419,12 @@ def parser_html(year, expedition, txt, seq=""):
dupl[check] = 1
tu = tidy_time_underground(tu)
- trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid)
+ trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid)
tripcave = tidy_trip_cave(place)
tripcontent = tidy_trip_image_urls(tripcontent, ldate)
tid = tidy_tid(tid, triptitle)
- entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
+ entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid)
logentries.append(entrytuple)
return logentries
@@ -509,13 +525,13 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
logtime_underground = 0
- trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid)
+ trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid)
# print(f" - author: {author}")
tripcave = tidy_trip_cave(place)
tripcontent = tidy_trip_image_urls(tripcontent, year)
tid = tidy_tid(tid, triptitle)
- entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
+ entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid)
logentries.append(entrytuple)
return logentries
@@ -621,10 +637,10 @@ def LoadLogbook(year):
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
)
for entrytuple in logentries:
- date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
+ date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
if expo == expedition: # unneeded check, we zeroed it before filling it
# print(f" -- {triptitle}")
- store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
+ store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid)
else:
print(f" ! unexpected log entry labelled as '{expedition}' {tid}" )
expo.save() # to save logbook name property
@@ -708,8 +724,8 @@ def LoadLogbooks():
# - LogBookEntry (text, who when etc.)
# - PersonLogEntry (who was on that specific trip mentioned in the logbook entry)
for entrytuple in allentries:
- date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
- store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
+ date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
+ store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid)
for expo in expos:
expo.save() # to save logbook name property