Updates to make 2018 blog merge work (faster)

author: Philip Sargent <philip.sargent@gmail.com> 2022-12-19 20:13:26 +0000
committer: Philip Sargent <philip.sargent@gmail.com> 2022-12-19 20:13:26 +0000
commit: bb14c94ab10cbd279586c97822372bba8375b67b (patch)
tree: 01b251b8290e8f5a6b2784a25ca71157cbc21f88
parent: 7e9bb737771bb031d7db7864a5267b75da8e08c0 (diff)
download: troggle-bb14c94ab10cbd279586c97822372bba8375b67b.tar.gz
troggle-bb14c94ab10cbd279586c97822372bba8375b67b.tar.bz2
troggle-bb14c94ab10cbd279586c97822372bba8375b67b.zip
5 files changed, 52 insertions, 18 deletions
diff --git a/core/models/survex.py b/core/models/survex.py
index cc9b9f8..5d60e62 100644
--- a/core/models/survex.py
+++ b/core/models/survex.py
@@ -171,7 +171,8 @@ class Wallet(models.Model):
     '''
     fpath               = models.CharField(max_length=200)
     walletname          = models.CharField(max_length=200)
-    
+    walletdate          = models.DateField(blank=True, null=True)
+   
     class Meta:
         ordering = ('walletname',)
     
@@ -238,6 +239,8 @@ class Wallet(models.Model):
 
     # Yes this is horribly, horribly inefficient, esp. for a page that have date, people and cave in it
     def date(self):
+        if self.walletdate:
+            return self.walletdate
         if not self.get_json():
             return None
         jsondata = self.get_json()
@@ -254,7 +257,9 @@ class Wallet(models.Model):
                     samedate = datetime.date.fromisoformat(datestr[:10])
                 except:
                     samedate = None
-            return samedate.isoformat()
+            self.walletdate = samedate.isoformat()
+            self.save()
+            return self.walletdate
         
     def people(self):
         if not self.get_json():
diff --git a/core/views/logbooks.py b/core/views/logbooks.py
index 517a48b..52e2d11 100644
--- a/core/views/logbooks.py
+++ b/core/views/logbooks.py
@@ -1,4 +1,5 @@
 import datetime
+import time
 import os.path
 import re
 
@@ -186,24 +187,26 @@ def personexpedition(request, first_name='',  last_name='', year=''):
 
 
 def logbookentry(request, date, slug):
-    this_logbookentry = LogbookEntry.objects.filter(date=date, slug=slug)
+    # start = time.time()
+    trips = LogbookEntry.objects.filter(date=date) # all the trips not just this one
+    this_logbookentry = trips.filter(date=date, slug=slug)
     
     if this_logbookentry:
         if len(this_logbookentry)>1:
             return render(request, 'object_list.html',{'object_list':this_logbookentry})
         else:
-            trips = LogbookEntry.objects.filter(date=date)
             wallets = set()
-            refwallets = Wallet.objects.filter(survexblock__date=date)
+            allwallets = Wallet.objects.all()
+            refwallets = allwallets.filter(survexblock__date=date)
             for r in refwallets:
                 wallets.add(r)
-            
-            allwallets = Wallet.objects.all()
+           
             # Note that w.year() only works for wallets which have a valid JSON file existing
-            for w in allwallets:
-                if w.date() == date:
-                    wallets.add(w)
-                
+            # This is very slow with a big lag as w.date() is a computed field
+            # Noticeably slow with WSL2 and NTFS filesystem, even with caching as walletdate.
+            jwallets = allwallets.filter(walletdate=date)
+            for j in jwallets:
+                wallets.add(j)
             thisexpo = this_expedition = Expedition.objects.get(year=int(date[0:4]))
             if thisexpo:
                 expeditionday = thisexpo.get_expedition_day(date)
@@ -214,6 +217,8 @@ def logbookentry(request, date, slug):
             this_logbookentry=this_logbookentry[0]
             # This is the only page that uses presontrip_next and persontrip_prev
             # and it is calculated on the fly in the model 
+            # duration = time.time()-start
+            # print(f"--- Render after {duration:.2f} seconds")
             return render(request, 'logbookentry.html', 
                 {'logbookentry': this_logbookentry, 'trips': trips, 'svxothers': svxothers, 'wallets': wallets})
     else:
diff --git a/parsers/imports.py b/parsers/imports.py
index a253964..47d0c4c 100644
--- a/parsers/imports.py
+++ b/parsers/imports.py
@@ -41,9 +41,8 @@ def import_logbooks():
     with transaction.atomic():
         troggle.parsers.logbooks.LoadLogbooks()
 
-def import_logbook(year=2022):
+def import_logbook(year=2018):
     print(f"-- Importing Logbook {year}")
-    print(f"-- - commented out")
     with transaction.atomic():
         troggle.parsers.logbooks.LoadLogbook(year)
 
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index e37780c..d194a5e 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -51,7 +51,7 @@ MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
 BLOG_PARSER_SETTINGS = {
 #                "2022": ("ukcavingblog.html", "parser_blog"), 
                 "2019": ("ukcavingblog.html", "parser_blog"), 
-                "2018": ("ukcavingblog.html", "parser_blog"), 
+#                "2018": ("ukcavingblog.html", "parser_blog"), 
 #               "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
             }
 DEFAULT_LOGBOOK_FILE = "logbook.html"
@@ -83,7 +83,7 @@ LOGBOOK_PARSER_SETTINGS = {
                 "1982": ("log.htm", "parser_html_01"), 
             }
 
-entries = { "2022": 86, "2019": 56, "2018": 86, "2017": 76, "2016": 83, "2015": 79, 
+entries = { "2022": 86, "2019": 56, "2018": 100, "2017": 76, "2016": 83, "2015": 79, 
     "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53, 
     "2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, 
     "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42, 
@@ -138,6 +138,9 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
                 tripperson = "Nadia"
             if tripperson =="tcacrossley":
                 tripperson = "Tom Crossley"
+            if tripperson =="Samouse1":
+                tripperson = "Todd Rye"
+                
 
                             
             personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
@@ -497,6 +500,13 @@ def parser_blog(year, expedition, txt, sq=""):
     
     This uses some of the more obscure capabilities of regular expressions,
     see https://docs.python.org/3/library/re.html
+    
+    BLOG entries have this structure:
+        <article ... data-author="Tinywoman" data-content="post-298780" id="js-post-298780">
+            <article class="message-body js-selectToQuote">
+            </article>
+        </article>    
+    So the content is nested inside the header. Attachments (images) come after the content.
     '''
     global logentries
     global logdataissues
@@ -508,19 +518,26 @@ def parser_blog(year, expedition, txt, sq=""):
         print(message)
 
     # (?= is a non-consuming match, see https://docs.python.org/3/library/re.html
-    tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
+    tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(</article[^>]*>)([\s\S]*?)(?=</article)", txt)
     if not ( tripparas ) :
         message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
         print(message)
         
     if (len(tripheads) !=len(tripparas)):
         print(f"{len(tripheads)} != {len(tripparas)}")
+    print(f"{len(tripheads)} - {len(tripparas)}")
 
     location = "Plateau" # best guess, fix manually later
     tu = 0
     logbook_entry_count = 0
     for i in range(0, len(tripparas)):
-        tripcontent = tripparas[i]
+        tripstuff = tripparas[i]
+        attach = tripstuff[2]
+        # note use of the non-greedy *? regex idiom here
+        attach = re.sub(r"<div class=\"file-content\">[\s\S]*?(?=</li>)","",attach)
+        attach = re.sub(r"<footer[\s\S]*(</footer>)","",attach)
+        tripcontent = tripstuff[0] + attach
+        #print(f"{i} - {len(tripstuff)} - {tripstuff[1]}")
         triphead = tripheads[i]
         logbook_entry_count += 1
         tid = set_trip_id(year,logbook_entry_count) +"_blog" + sq
@@ -684,8 +701,15 @@ def LoadLogbook(year):
     nlbe={}
     TROG['pagecache']['expedition'][year] = None # clear cache
     
-    expo = Expedition.objects.get(year=year)    
+    expo = Expedition.objects.get(year=year)   
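+    year = expo.year # use the stored value: callers may pass an int (e.g. 2018) but the settings dicts are keyed by str - presumably expo.year is a str, TODO confirm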
     nlbe[expo] = LoadLogbookForExpedition(expo)  # this actually loads the logbook for one expo
+    if year in BLOG_PARSER_SETTINGS:
+        print("BLOG parsing")
+        LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year] 
+        nlbe[expo] = LoadLogbookForExpedition(expo, clean=False)  # this loads the blog logbook for one expo
+    else:
+        print(f" {year} not in {BLOG_PARSER_SETTINGS}")
 
 def LoadLogbooks():
     """ This is the master function for parsing all logbooks into the Troggle database. 
diff --git a/templates/logbook2005style.html b/templates/logbook2005style.html
index bf5534c..0104070 100644
--- a/templates/logbook2005style.html
+++ b/templates/logbook2005style.html
@@ -4,6 +4,7 @@
 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
 <title>{{logbook_entries.0.expedition}} Expo Logbook</title>
 <link rel="stylesheet" href="../../css/main2.css" />
+<meta name="keywords" content="NOEDIT">
 <style>figure {font-weight: bold; font-size: small; font-family: sans-serif;font-variant-caps: small-caps;}</style>
 </head>
 <!-- Exported by troggle in this format after having been imported using a different format and a different
author	Philip Sargent <philip.sargent@gmail.com>	2022-12-19 20:13:26 +0000
committer	Philip Sargent <philip.sargent@gmail.com>	2022-12-19 20:13:26 +0000
commit	bb14c94ab10cbd279586c97822372bba8375b67b (patch)
tree	01b251b8290e8f5a6b2784a25ca71157cbc21f88
parent	7e9bb737771bb031d7db7864a5267b75da8e08c0 (diff)
download	troggle-bb14c94ab10cbd279586c97822372bba8375b67b.tar.gz troggle-bb14c94ab10cbd279586c97822372bba8375b67b.tar.bz2 troggle-bb14c94ab10cbd279586c97822372bba8375b67b.zip