diff options
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- | parsers/logbooks.py | 46 |
1 files changed, 38 insertions, 8 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py index 466414c..a5f6631 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -1,6 +1,7 @@ import os import re import sys +import string import time from datetime import date, datetime @@ -15,6 +16,7 @@ from troggle.core.models.caves import GetCaveLookup from troggle.core.models.logbooks import LogbookEntry, PersonLogEntry from troggle.core.models.troggle import DataIssue, Expedition from troggle.core.utils import get_process_memory +from troggle.core.views.uploads import unique_slug """ Parses and imports logbooks in all their wonderful confusion @@ -106,11 +108,31 @@ ENTRIES = { logentries = [] # the entire logbook for one year is a single object: a list of entries noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"] +tripsdate = {} +alphabet = [] -def set_trip_id(year, seq): +def set_trip_seq_id(year, seq): + '''We have not parsed the trip date yet, so this is a sequence number + ''' tid = f"{year}_s{seq:02d}" return tid +def reset_trip_id(date): + '''Now we have the date, we can set the tripid (the lbe slug) to be in our standard form + of <date><letter>, e.g. '2003-07-30b' + BUT this gets re-set every time the logbook is imported, + so they are not persistent as we would much prefer. 
+ ''' + global alphabet + already =tripsdate.get(date, 0) # returns zero if none found + tripsdate[date] = already +1 + if not alphabet: + alphabet = list(string.ascii_lowercase) + + tid = f"{date}{alphabet[already]}" + # print(tid) + return tid + rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$") rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]") @@ -246,9 +268,14 @@ def store_entry_into_database(date, place, tripcave, title, text, trippersons, a "expedition": expedition, "time_underground": logtime_underground, "cave_slug": str(tripcave), - "slug": tid, } - lookupAttribs = {"date": date, "title": title} + lookupAttribs = {"slug": tid, "date": date, "title": title} + if LogbookEntry.objects.filter(slug=tid).exists(): + # oops. + message = " ! - DUPLICATE SLUG for logbook entry " + tripdate + " - " + slug + DataIssue.objects.create(parser="logbooks", message=message) + slug = slug + "_" + unique_slug(text,2) + lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs) pt_list = [] @@ -332,7 +359,7 @@ def parser_html(year, expedition, txt, seq=""): logbook_entry_count = 0 for trippara in tripparas: logbook_entry_count += 1 - tid = set_trip_id(year, logbook_entry_count) + tid = set_trip_seq_id(year, logbook_entry_count) # print(f' - new tid:{tid} lbe count: {logbook_entry_count}') s = re.match( @@ -376,6 +403,9 @@ def parser_html(year, expedition, txt, seq=""): continue ldate = parser_date(tripdate.strip(), year) + + # Now we have a date, we can reset tripid + tid = reset_trip_id(ldate) triptitles = triptitle.split(" - ") if len(triptitles) >= 2: place = triptitles[0] @@ -385,7 +415,7 @@ def parser_html(year, expedition, txt, seq=""): tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip() triptitle = triptitle.strip() - # triptitle must be unique for a given date. We fix this here. + # triptitle must be unique for a given date. We fix this here. [Why?!] 
check = (ldate, triptitle) if check in dupl: dupl[check] += 1 @@ -458,7 +488,7 @@ def parser_blog(year, expedition, txt, sq=""): # print(f"{i} - {len(tripstuff)} - {tripstuff[1]}") triphead = tripheads[i] logbook_entry_count += 1 - tid = set_trip_id(year, logbook_entry_count) + "_blog" + sq + tid = set_trip_seq_id(year, logbook_entry_count) + "_blog" + sq # print(f" - tid: {tid}") # data-author="tcacrossley" @@ -580,7 +610,7 @@ def parse_logbook_for_expedition(expedition, blog=False): if logbook_parseable: # -------------------- parser = globals()[parsefunc] - print(f" - {year} parsing with {parsefunc} - {lb}") + # print(f" - {year} parsing with {parsefunc} - {lb}") print(" .", end="") logentries = parser(year, expedition, txt, sq) # this launches the right parser # -------------------- @@ -595,7 +625,7 @@ def parse_logbook_for_expedition(expedition, blog=False): def LoadLogbook(year): - """One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload' + """One off logbook for testing purposes, and also reloadable on '/expedition/2023?reload' This is inside an atomic transaction""" expo = Expedition.objects.get(year=year) |