diff options
author | Philip Sargent <philip.sargent@gmail.com> | 2023-09-02 17:49:37 +0300 |
---|---|---|
committer | Philip Sargent <philip.sargent@gmail.com> | 2023-09-02 17:49:37 +0300 |
commit | 1a8bc17f806d88b06aeabc28c11a6da199216fe2 (patch) | |
tree | f02f5038db4b02d78cf5fbf09bea91bb3a287966 /parsers/logbooks.py | |
parent | c9729c046ccccfd5e858f1fe8fbf24619d156e30 (diff) | |
download | troggle-1a8bc17f806d88b06aeabc28c11a6da199216fe2.tar.gz troggle-1a8bc17f806d88b06aeabc28c11a6da199216fe2.tar.bz2 troggle-1a8bc17f806d88b06aeabc28c11a6da199216fe2.zip |
Fixed parsers
Diffstat (limited to 'parsers/logbooks.py')
-rw-r--r-- | parsers/logbooks.py | 66 |
1 files changed, 41 insertions, 25 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py index fdc68ad..ad92931 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -60,8 +60,8 @@ LOGBOOK_PARSER_SETTINGS = { LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB ENTRIES = { - "2023": 81, - "2022": 93, + "2023": 83, + "2022": 94, "2019": 55, "2018": 95, "2017": 74, @@ -127,7 +127,7 @@ def reset_trip_id(date): suffix = alphabet_suffix(n) tid = f"{date}{suffix}" - # print(tid) + # print(already, n, tid) return tid rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$") @@ -136,6 +136,7 @@ rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]") def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): res = [] author = None + guests = [] # print(f'# {tid}') # print(f" - {tid} '{trippeople}' ") @@ -154,11 +155,12 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): try: personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower()) if not personyear: - if known_foreigner(tripperson): - message = f" ! - {expedition.year} Known foreigner: '{tripperson}' in entry {tid=}" + guests.append(nickname_used) + if known_foreigner(nickname_used): + message = f" ! - {expedition.year} Known foreigner: '{nickname_used}' in entry {tid=}" print(message) else: - message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this year." + message = f" ! - {expedition.year} No name match for: '{nickname_used}' in entry {tid=} for this year." print(message) DataIssue.objects.create(parser="logbooks", message=message) res.append((personyear, nickname_used, logtime_underground)) @@ -170,10 +172,9 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): raise if author_u: author = personyear - else: - # a person but with * prefix. Ignored everywhere. - # print(f" ! - {expedition.year} * person : {tripperson}") - pass + else: # *guest + guests.append(tripperson) + # print(f" ! - {expedition.year} * GUEST : {tripperson}") if not author: if not res: @@ -181,7 +182,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): author = res[-1][0] # the previous valid person and a time of 0 hours # print(f" - {tid} [{author.person}] '{res[0][0].person}'...") - return res, author + return res, author, guests def tidy_time_underground(logtime_underground): # Nasty hack, must tidy this up.. @@ -202,7 +203,7 @@ def tidy_time_underground(logtime_underground): def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid): try: - trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid) + trippersons, author, guests = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid) # trippersons is a list of tuples (personyear, nickname_used, logtime_underground) except: message = f" ! - {expedition.year} Logentry: {title} - GetTripPersons FAIL to recognise nickname" @@ -216,7 +217,7 @@ def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid): DataIssue.objects.create(parser="logbooks", message=message) print(message) - return trippersons, author + return trippersons, author, guests def tidy_trip_cave(place): # GetCaveLookup() need to work better. None of this data is *used* though? @@ -251,16 +252,18 @@ def tidy_tid(tid, title): tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_") return tid -def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid): +def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, guests, expedition, logtime_underground, tid): """saves a single logbook entry and related personlogentry items We could do a bulk update to save all the entries, but then we would need to do a query on - each one to get the primary key to asign to the PersonLogEntries. So overall probably not much + each one to get the primary key to assign to the PersonLogEntries. So overall probably not much faster ? """ + other_people = ", ".join(guests) # join list members separated by comma nonLookupAttribs = { "place": place, + "other_people": other_people, # *Ol's Mum, foreigners.. "text": text, "expedition": expedition, "time_underground": logtime_underground, @@ -324,6 +327,17 @@ def parser_date(tripdate, year): def parser_html(year, expedition, txt, seq=""): """This uses some of the more obscure capabilities of regular expressions, see https://docs.python.org/3/library/re.html + + e.g. + * is greedy + *? is non-greedy + + (?x) flag means VERBOSE + + (?: ) non-capturing parentheses + + \s whitespace + \S NOT whitespace You can't see it here, but a round-trip export-then-import will move the endmatter up to the frontmatter. This made sense when translating @@ -357,7 +371,7 @@ def parser_html(year, expedition, txt, seq=""): for trippara in tripparas: logbook_entry_count += 1 tid = set_trip_seq_id(year, logbook_entry_count) - # print(f' - new tid:{tid} lbe count: {logbook_entry_count}') + # print(f' - new seq tid:{tid} lbe count: {logbook_entry_count}') s = re.match( r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date @@ -367,15 +381,17 @@ def parser_html(year, expedition, txt, seq=""): \s*<div\s+class="triptitle">\s*(.*?)</div> ([\s\S]*?) \s*(?:<div\s+class="timeug">\s*(.*?)</div>)? + \s*(?:<div\s+class="editentry"\s*.*?</div>)? \s*$ """, trippara, ) if s: tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups() + # print(f"#{logbook_entry_count} {tu} {len(triptext)} ") else: # if not re.search(r"Rigging Guide", trippara): - msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'" + msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:75]}'..." print(msg) DataIssue.objects.create(parser="logbooks", message=msg) continue @@ -403,12 +419,12 @@ def parser_html(year, expedition, txt, seq=""): dupl[check] = 1 tu = tidy_time_underground(tu) - trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid) + trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid) tripcave = tidy_trip_cave(place) tripcontent = tidy_trip_image_urls(tripcontent, ldate) tid = tidy_tid(tid, triptitle) - entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid) + entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid) logentries.append(entrytuple) return logentries @@ -509,13 +525,13 @@ def parser_blog(year, expedition, txt, sq=""): tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent logtime_underground = 0 - trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid) + trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid) # print(f" - author: {author}") tripcave = tidy_trip_cave(place) tripcontent = tidy_trip_image_urls(tripcontent, year) tid = tidy_tid(tid, triptitle) - entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid) + entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid) logentries.append(entrytuple) return logentries @@ -621,10 +637,10 @@ def LoadLogbook(year): f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}" ) for entrytuple in logentries: - date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple + date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple if expo == expedition: # unneeded check, we zeroed it before filling it # print(f" -- {triptitle}") - store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid) + store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid) else: print(f" ! unexpected log entry labelled as '{expedition}' {tid}" ) expo.save() # to save logbook name property @@ -708,8 +724,8 @@ def LoadLogbooks(): # - LogBookEntry (text, who when etc.) # - PersonLogEntry (who was on that specific trip mentione din the logbook entry) for entrytuple in allentries: - date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple - store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid) + date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple + store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid) for expo in expos: expo.save() # to save logbook name property |