import os
import re
import sys
import string
import time
from datetime import date, datetime
from pathlib import Path
from random import randint
from django.conf import settings
from django.template.defaultfilters import slugify
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos, known_foreigner
from troggle.core.models.caves import GetCaveLookup
from troggle.core.models.logbooks import LogbookEntry, PersonLogEntry
from troggle.core.models.troggle import DataIssue, Expedition
from troggle.core.utils import get_process_memory, alphabet_suffix, unique_slug
"""
Parses and imports logbooks in all their wonderful confusion
See detailed explanation of the complete process:
https://expo.survex.com/handbook/computing/logbooks-parsing.html
"""
todo = """
- check cross-references in other logbooks and other HTML frahments
e.g. cave descriptions
- Most of the time is during the database writing (6s out of 8s).
- profile the code to find bad repetitive things, of which there are many.
- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted
- replace explicit 1970 date with a constant EPOCH
- rewrite to use generators rather than storing everything intermediate in lists - to
reduce memory impact [low priority]
- We should ensure logbook.html is utf-8 and stop this crap:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
"""
# NOTE(review): not referenced in the part of the file visible here; presumably the
# maximum length of a logbook entry title - confirm against the code that uses it.
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
# Per-year blog imports: keyed by year string, value is a (filename, parser function name)
# tuple, as in the commented-out examples below. Currently empty.
BLOG_PARSER_SETTINGS = { # no default, must be explicit
# "2023": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
}
# Logbook filename and parser function name used for any year without an explicit
# entry in LOGBOOK_PARSER_SETTINGS (see the comment below).
DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"
# All years now (Jan.2023) use the default value for Logbook parser
# don't forget to update expoweb/pubs.htm to match. 1982 left as reminder of expected format.
# Same shape as BLOG_PARSER_SETTINGS: year string -> (filename, parser function name).
LOGBOOK_PARSER_SETTINGS = {
"1982": ("logbook.html", "parser_html"),
}
# parser_html() joins settings.EXPOWEB, this folder and the year to locate
# frontmatter.html and endmatter.html.
LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB
# Integer table keyed by expedition year (as a string).
# NOTE(review): presumably the expected number of logbook entries for each year -
# confirm against the code that reads ENTRIES, which is not visible here.
ENTRIES = {
"2023": 81,
"2022": 93,
"2019": 55,
"2018": 95,
"2017": 74,
"2016": 86,
"2015": 80,
"2014": 67,
"2013": 52,
"2012": 76,
"2011": 71,
"2010": 22,
"2009": 53,
"2008": 49,
"2007": 113,
"2006": 60,
"2005": 55,
"2004": 76,
"2003": 42,
"2002": 31,
"2001": 49,
"2000": 54,
"1999": 79,
"1998": 43,
"1997": 53,
"1996": 95,
"1995": 42,
"1994": 32,
"1993": 41,
"1992": 62,
"1991": 39,
"1990": 87,
"1989": 63,
"1988": 61,
"1987": 34,
"1985": 24,
"1984": 32,
"1983": 52,
"1982": 42,
# "1979": 30, # to be hand-edited
"1978": 38,
}
# What about 1970s ! Yes, 80 and 81 are missing, so are 1976 and 1977.
# NOTE(review): parser_html() and parser_blog() each build their own local `logentries`,
# so this module-level list is not written by the code visible here.
logentries = [] # the entire logbook for one year is a single object: a list of entries
# Place names that tidy_trip_cave() treats as not being caves. Entries are in mixed case.
noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
# date -> number of trip ids issued for that date so far; maintained by reset_trip_id().
tripsdate = {}
def set_trip_seq_id(year, seq):
    """Return a provisional trip id built from the year and a sequence number.

    The trip date has not been parsed at this point, so the id has the form
    '<year>_s<seq>', with seq zero-padded to at least two digits.
    """
    return f"{year}_s{seq:02d}"
def reset_trip_id(date):
    """Build the trip id (the logbook entry slug) for an entry on `date`.

    Now that the date is known the id takes our standard form, e.g.
    '2003-07-30b': the date followed by a suffix from alphabet_suffix().
    The module-level `tripsdate` dict counts how many ids have been issued
    for each date, and that running count selects the suffix.

    The ids are regenerated every time the logbook is imported; the entries
    are ordered on this field.
    """
    count = tripsdate.get(date, 0) + 1  # dates not seen before start from zero
    tripsdate[date] = count
    return f"{date}{alphabet_suffix(count)}"
# Applied to each name fragment in GetTripPersons(): case-insensitive, lazily captures
# everything up to the end of the string as group 1.
# NOTE(review): as written this matches any single-line string, so GetTripPersons()
# always sees a match - confirm the pattern has not lost markup it was meant to contain.
rx_tripperson = re.compile(r"(?i)(.*?)$")
# Matches a bracketed aside, "(...)" or "[...]" (non-greedy); GetTripPersons() removes these from names.
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
    """Turn the free-text list of people on a logbook entry into person records.

    `trippeople` is split on ",", "+", "&" and " and ". Each fragment is
    stripped; fragments that are empty or begin with "*" are ignored. For the
    rest, bracketed text (rx_round_bracket) is removed and the lower-cased
    name is looked up in GetPersonExpeditionNameLookup(expedition).

    A name with no match is reported with print(); unless known_foreigner()
    recognises it, a DataIssue is recorded as well. The person is appended to
    the result either way, so `personyear` may be falsy in the returned tuples.

    Returns (people, author). `people` is a list of
    (personyear, nickname_used, logtime_underground) tuples. `author` is the
    personyear of the last processed fragment that matched rx_tripperson; if
    that is falsy, the personyear of the last tuple is used instead. When
    there is neither an author nor any people, ("", 0) is returned.

    Any exception raised while looking up or recording a person is logged as
    a DataIssue and re-raised.
    """
    people = []
    author = None

    for fragment in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople):
        name = fragment.strip()
        author_match = rx_tripperson.match(name)
        if author_match:
            name = author_match.group(1).strip()

        # Blank fragments, and names with the special "*" prefix, are ignored everywhere.
        if not name or name[0] == "*":
            continue

        name = rx_round_bracket.sub("", name).strip()
        # Whacky aliases all resolved in GetPersonExpeditionNameLookup()
        nickname_used = name
        try:
            personyear = GetPersonExpeditionNameLookup(expedition).get(name.lower())
            if not personyear:
                if known_foreigner(name):
                    message = f" ! - {expedition.year} Known foreigner: '{name}' in entry {tid=}"
                    print(message)
                else:
                    message = f" ! - {expedition.year} No name match for: '{name}' in entry {tid=} for this year."
                    print(message)
                    DataIssue.objects.create(parser="logbooks", message=message)
            people.append((personyear, nickname_used, logtime_underground))
        except:
            # This should not happen. We do not raise exceptions in that function
            message = f" ! - {expedition.year} EXCEPTION: '{name}' ({nickname_used}) in entry {tid=} for this year."
            print(message)
            DataIssue.objects.create(parser="logbooks", message=message)
            raise

        if author_match:
            author = personyear

    if author:
        return people, author
    if not people:
        return "", 0
    # Fall back to the previous valid person and a time of 0 hours
    return people, people[-1][0]
def tidy_time_underground(logtime_underground):
    """Convert the raw 'time underground' value of a logbook entry to a number.

    Accepts anything float() can convert directly, or a string that starts
    with an optional "T/U:" label followed by a number and any trailing text
    (e.g. "T/U: 3.5 hrs").

    Returns the value as a float, or 0 when the input is empty/falsy or does
    not start with a recognisable number.
    """
    if not logtime_underground:
        return 0
    try:
        return float(logtime_underground)
    except ValueError:
        # Not a bare number. Only ValueError is caught, so that interrupts and
        # genuine programming errors are no longer silently swallowed.
        tu_match = re.match(r"(T/U:\s*)?(\d+[.]?\d*).*", logtime_underground)
    return float(tu_match.group(2)) if tu_match else 0
def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
    """Resolve the people named on a logbook entry, recording problems as DataIssues.

    Returns the (trippersons, author) pair produced by GetTripPersons();
    trippersons is a list of tuples (personyear, nickname_used, logtime_underground).

    If GetTripPersons() raises, the failure is recorded as a DataIssue,
    printed, and the exception is re-raised. If no author could be
    determined, a warning DataIssue is recorded and the pair is still returned.
    """
    try:
        trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
    except Exception:
        message = f" ! - {expedition.year} Logentry: {title} - GetTripPersons FAIL to recognise nickname"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)
        raise

    if not author:
        message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)

    return trippersons, author
def tidy_trip_cave(place):
    """Return the cave that GetCaveLookup() has for `place`, or None.

    The place name is lower-cased before use. Names listed in `noncaveplaces`
    (compared case-insensitively) are never looked up and give None; so does
    any name missing from the lookup.
    """
    # GetCaveLookup() need to work better. None of this data is *used* though?
    # 'tripcave' is converted to a string doing this, which renders as the cave slug.
    lplace = place.lower()
    # noncaveplaces holds mixed-case names ("Journey", "UNKNOWN", ...), so both
    # sides must be lower-cased or those entries can never match.
    if lplace in {name.lower() for name in noncaveplaces}:
        return None
    return GetCaveLookup().get(lplace)
def tidy_trip_image_urls(text, date):
    """Point relative `src` attributes in an entry's HTML at /years/<year>/.

    <year> is the first four characters of str(date). A src value is left
    untouched when it already starts with "/" (which includes values already
    under /years/<year>/) or with a URL scheme such as "https:" or "data:",
    so absolute URLs are no longer corrupted by the prefix.

    Tabs are then removed and each run of three newlines is reduced to two
    (a single pass, as before). Returns the modified text.
    """
    y = str(date)[:4]
    prefix = f"/years/{y}/"
    # Match the opening of a src attribute (either quote style) whose value is
    # neither root-relative/protocol-relative nor prefixed with a URL scheme.
    text = re.sub(
        r"""( src=["'])(?!/|[A-Za-z][A-Za-z0-9+.\-]*:)""",
        lambda m: m.group(1) + prefix,
        text,
    )
    text = text.replace("\t", "")
    text = text.replace("\n\n\n", "\n\n")
    return text
def tidy_tid(tid, title):
    """Return `tid` unchanged unless it is None, in which case invent one.

    The invented id is a random four-digit number, an underscore, and the
    first ten characters of the slugified title with "-" replaced by "_".
    """
    if tid is None:
        random_part = str(randint(1000, 9999))
        title_part = slugify(title)[:10].replace("-", "_")
        return random_part + "_" + title_part
    return tid
def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid):
    """Save a single logbook entry and its related PersonLogEntry items.

    A LogbookEntry is created with slug `tid`. If an entry with that slug
    already exists, a DataIssue is recorded and "_" plus unique_slug(text, 2)
    is appended to the slug, so the new row does not reuse it. One
    PersonLogEntry is then built for each
    (personexpedition, nickname_used, time_underground) tuple in
    `trippersons` and they are written with a single bulk_create();
    is_logbook_entry_author is set where the person equals `author`.

    We could do a bulk update to save all the entries, but then we would need
    to do a query on each one to get the primary key to assign to the
    PersonLogEntries. So overall probably not much faster ?
    """
    slug = tid
    if LogbookEntry.objects.filter(slug=slug).exists():
        # oops. Our code should already have ensured this is unique.
        message = f" ! - DUPLICATE SLUG for logbook entry {date} - {slug}"
        DataIssue.objects.create(parser="logbooks", message=message)
        slug = f"{slug}_{unique_slug(text, 2)}"

    lbo = LogbookEntry.objects.create(
        slug=slug,
        date=date,
        title=title,
        place=place,
        text=text,
        expedition=expedition,
        time_underground=logtime_underground,
        cave_slug=str(tripcave),
    )

    pt_list = [
        PersonLogEntry(
            personexpedition=tripperson,
            nickname_used=nickname_used,
            logbook_entry=lbo,
            time_underground=time_underground,
            is_logbook_entry_author=(tripperson == author),
        )
        for tripperson, nickname_used, time_underground in trippersons
    ]
    PersonLogEntry.objects.bulk_create(pt_list)
def parser_date(tripdate, year):
    """Interpret a date from an expo logbook and return a datetime.date.

    tripdate: the date text, either "YYYY-MM-DD" or "D/M/YY" / "D/M/YYYY"
        (day and month may have one or two digits).
    year: the logbook's year as a four-character string.

    For the "YYYY-MM-DD" form the year must equal `year`. For the slash form
    an explicit century must equal the first two characters of `year`, and the
    century of the result is always taken from `year`.

    Returns date(1970, 1, 1), after recording a DataIssue, when the text
    matches neither form, fails those checks, or is not a real calendar date.
    """
    dummydate = date(1970, 1, 1)  # replace with _EPOCH
    # message = f" ! - Trying to parse date in logbook: {tripdate} - {year}"
    # print(message)
    try:
        mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
        # Month accepts one or two digits so that October-December also parse.
        mdategoof = re.match(r"(\d\d?)/(\d\d?)/(20|19)?(\d\d)", tripdate)
        if mdatestandard:
            if mdatestandard.group(1) != year:
                message = f" ! - Bad date (year) in logbook: {tripdate} - {year}"
                DataIssue.objects.create(parser="logbooks", message=message)
                return dummydate
            year, month, day = (int(part) for part in mdatestandard.groups())
        elif mdategoof:
            century = mdategoof.group(3)
            if century and century != year[:2]:
                message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + century
                DataIssue.objects.create(parser="logbooks", message=message)
                return dummydate
            day, month = int(mdategoof.group(1)), int(mdategoof.group(2))
            year = int(year[:2]) * 100 + int(mdategoof.group(4))
        else:
            # Report the logbook's year rather than the 1970 placeholder.
            message = f" ! - Bad date in logbook: {tripdate} - {year}"
            DataIssue.objects.create(parser="logbooks", message=message)
            return dummydate
        # date() validates the values and raises ValueError for e.g. 31 February.
        return date(year, month, day)
    except Exception:
        message = f" ! - Failed to parse date in logbook: {tripdate} - {year}"
        DataIssue.objects.create(parser="logbooks", message=message)
        # `datetime` here is the class imported from the datetime module, so
        # datetime.date(1970, 1, 1) would itself raise; return the date object.
        return dummydate
def parser_html(year, expedition, txt, seq=""):
"""This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
You can't see it here, but a round-trip export-then-import will move
the endmatter up to the frontmatter. This made sense when translating
from parser_html_01 format logfiles, believe me.
"""
# NOTE(review): the indentation and several regex literals in this function look damaged
# in this copy of the file (string literals that are never closed) - restore them from
# version control before changing any code here.
# Entries parsed so far; returned at the end as a list of tuples.
logentries = []
# (date, title) -> number of times that pair has been seen, used to make titles unique per date.
dupl = {}
# extract front material and stash for later use when rebuilding from list of entries
headmatch = re.match(r"(?i)(?s).*]*>(.*?) 0:
frontpath = Path(settings.EXPOWEB, LOGBOOKS_DIR, year, "frontmatter.html")
with open(frontpath, "w") as front:
front.write(headpara + "\n")
# extract END material and stash for later use when rebuilding from list of entries
endmatch = re.match(r"(?i)(?s).*([\s\S]*?)(?= 0:
print(f"\n - {year} endpara:\n'{endpara}'")
endpath = Path(settings.EXPOWEB, LOGBOOKS_DIR, year, "endmatter.html")
with open(endpath, "w") as end:
end.write(endpara + "\n")
tripparas = re.findall(r"([\s\S]*?)(?=.*?\s*
)? # second date
\s*(?:\s*)?
\s*
(.*?)
(?:
)?
\s*
\s*(.*?)
\s*
\s*(.*?)
([\s\S]*?)
\s*(?:
\s*(.*?)
)?
\s*$
""",
trippara,
)
# An entry that does not match the pattern is reported as a DataIssue and skipped.
if s:
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
else:
# if not re.search(r"Rigging Guide", trippara):
msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'"
print(msg)
DataIssue.objects.create(parser="logbooks", message=msg)
continue
# Convert the raw date text to a date; parser_date() gives 1970-01-01 for dates it rejects.
ldate = parser_date(tripdate.strip(), year)
# Now we have a date, we can reset tripid
tid = reset_trip_id(ldate)
# The place is whatever precedes the first " - " in the title; a title without " - " gets "Unknown".
triptitles = triptitle.split(" - ")
if len(triptitles) >= 2:
place = triptitles[0]
else:
place = "Unknown"
tripcontent = re.sub(r"
", "", triptext)
tripcontent = re.sub(r"
", "
", tripcontent).strip()
triptitle = triptitle.strip()
# triptitle must be unique for a given date. [Why?!] We fix this here.
check = (ldate, triptitle)
if check in dupl:
dupl[check] += 1
triptitle = f"{triptitle} #{dupl[check]}"
print(f" - {triptitle} -- {ldate}")
else:
dupl[check] = 1
# Normalise each field with the tidy_* helpers, then queue the entry as a tuple.
tu = tidy_time_underground(tu)
trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid)
tripcave = tidy_trip_cave(place)
tripcontent = tidy_trip_image_urls(tripcontent, ldate)
# tid was set by reset_trip_id() above; tidy_tid() only replaces it when it is None.
tid = tidy_tid(tid, triptitle)
entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
logentries.append(entrytuple)
return logentries
def parser_blog(year, expedition, txt, sq=""):
"""Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
Note that the entries have dates and authors, but no titles.
See detailed explanation of the complete process:
https://expo.survex.com/handbook/computing/logbooks-parsing.html
https://expo.survex.com/handbook/computing/log-blog-parsing.html
This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
BLOG entries have this structure:
So the content is nested inside the header. Attachments (images) come after the content.
It's a bugger, but it's out of our control.
"""
logentries = []
tripheads = re.findall(
# note use of non-greedy capturing (?: regex idiom here
r"\s*([\s\S]*?)(]*>)([\s\S]*?)(?=[\s\S]*?(?=)", "", attach)
attach = re.sub(r"