summary refs log tree commit diff stats
path: root/parsers/logbooks.py
diff options
context:
space:
mode:
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- parsers/logbooks.py 46
1 files changed, 38 insertions, 8 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 466414c..a5f6631 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,6 +1,7 @@
import os
import re
import sys
+import string
import time
from datetime import date, datetime
@@ -15,6 +16,7 @@ from troggle.core.models.caves import GetCaveLookup
from troggle.core.models.logbooks import LogbookEntry, PersonLogEntry
from troggle.core.models.troggle import DataIssue, Expedition
from troggle.core.utils import get_process_memory
+from troggle.core.views.uploads import unique_slug
"""
Parses and imports logbooks in all their wonderful confusion
@@ -106,11 +108,31 @@ ENTRIES = {
logentries = [] # the entire logbook for one year is a single object: a list of entries
noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
+tripsdate = {}
+alphabet = []
-def set_trip_id(year, seq):
+def set_trip_seq_id(year, seq):
+ '''Return a provisional trip id of the form "<year>_s<seq>", with seq zero-padded to two digits.
+ We have not parsed the trip date yet, so this is only a sequence number.
+ '''
tid = f"{year}_s{seq:02d}"
return tid
+def reset_trip_id(date):
+    '''Return the trip id (the lbe slug) for the next logbook entry on this date.
+
+    Now we have the date, we can set the tripid to be in our standard form
+    of <date><letter>, i.e. '2003-07-30b'. Each call increments the per-date
+    counter held in the module-level tripsdate dict, so successive calls with
+    the same date yield the suffixes a, b, ... z and then aa, ab, ... instead
+    of raising IndexError on the 27th entry for one date.
+
+    BUT this gets re-set every time the logbook is imported,
+    so they are not persistent as we would much prefer.
+
+    date -- the parsed trip date. It is used as a dict key and rendered through
+            an f-string, so the text of the id depends on the type passed in.
+            (Inside this function the name shadows the datetime.date import.)
+    '''
+    already = tripsdate.get(date, 0)  # how many ids were already issued for this date
+    tripsdate[date] = already + 1
+
+    # Bijective base-26 numbering: 0 -> 'a', 25 -> 'z', 26 -> 'aa', 27 -> 'ab', ...
+    letters = string.ascii_lowercase
+    suffix = ""
+    n = already
+    while True:
+        n, r = divmod(n, len(letters))
+        suffix = letters[r] + suffix
+        if n == 0:
+            break
+        n -= 1  # shift so that the next digit up also starts at 'a'
+
+    return f"{date}{suffix}"
+
rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$")
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
@@ -246,9 +268,14 @@ def store_entry_into_database(date, place, tripcave, title, text, trippersons, a
"expedition": expedition,
"time_underground": logtime_underground,
"cave_slug": str(tripcave),
- "slug": tid,
}
- lookupAttribs = {"date": date, "title": title}
+ lookupAttribs = {"slug": tid, "date": date, "title": title}
+ if LogbookEntry.objects.filter(slug=tid).exists():
+ # oops.
+ message = " ! - DUPLICATE SLUG for logbook entry " + tripdate + " - " + slug
+ DataIssue.objects.create(parser="logbooks", message=message)
+ slug = slug + "_" + unique_slug(text,2)
+
lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
pt_list = []
@@ -332,7 +359,7 @@ def parser_html(year, expedition, txt, seq=""):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
- tid = set_trip_id(year, logbook_entry_count)
+ tid = set_trip_seq_id(year, logbook_entry_count)
# print(f' - new tid:{tid} lbe count: {logbook_entry_count}')
s = re.match(
@@ -376,6 +403,9 @@ def parser_html(year, expedition, txt, seq=""):
continue
ldate = parser_date(tripdate.strip(), year)
+
+ # Now we have a date, we can reset tripid
+ tid = reset_trip_id(ldate)
triptitles = triptitle.split(" - ")
if len(triptitles) >= 2:
place = triptitles[0]
@@ -385,7 +415,7 @@ def parser_html(year, expedition, txt, seq=""):
tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()
triptitle = triptitle.strip()
- # triptitle must be unique for a given date. We fix this here.
+ # triptitle must be unique for a given date. We fix this here. [Why?!]
check = (ldate, triptitle)
if check in dupl:
dupl[check] += 1
@@ -458,7 +488,7 @@ def parser_blog(year, expedition, txt, sq=""):
# print(f"{i} - {len(tripstuff)} - {tripstuff[1]}")
triphead = tripheads[i]
logbook_entry_count += 1
- tid = set_trip_id(year, logbook_entry_count) + "_blog" + sq
+ tid = set_trip_seq_id(year, logbook_entry_count) + "_blog" + sq
# print(f" - tid: {tid}")
# data-author="tcacrossley"
@@ -580,7 +610,7 @@ def parse_logbook_for_expedition(expedition, blog=False):
if logbook_parseable:
# --------------------
parser = globals()[parsefunc]
- print(f" - {year} parsing with {parsefunc} - {lb}")
+ # print(f" - {year} parsing with {parsefunc} - {lb}")
print(" .", end="")
logentries = parser(year, expedition, txt, sq) # this launches the right parser
# --------------------
@@ -595,7 +625,7 @@ def parse_logbook_for_expedition(expedition, blog=False):
def LoadLogbook(year):
- """One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
+ """One off logbook for testing purposes, and also reloadable on '/expedition/2023?reload'
This is inside an atomic transaction"""
expo = Expedition.objects.get(year=year)