diff options
author | Philip Sargent <philip.sargent@gmail.com> | 2025-01-09 21:59:27 +0000 |
---|---|---|
committer | Philip Sargent <philip.sargent@gmail.com> | 2025-01-09 21:59:27 +0000 |
commit | 219b8b792e2a6e1fb72b9e658b06395a50292e59 (patch) | |
tree | a9ba93662e00af43afff3c4cb5ba6ad386e4ca8b | |
parent | 5b97cd83dd92ff506a40ac784d816a0be4bcc4eb (diff) | |
download | troggle-219b8b792e2a6e1fb72b9e658b06395a50292e59.tar.gz troggle-219b8b792e2a6e1fb72b9e658b06395a50292e59.tar.bz2 troggle-219b8b792e2a6e1fb72b9e658b06395a50292e59.zip |
AI comments on regexes
-rw-r--r-- | parsers/logbooks.py | 12 | ||||
-rw-r--r-- | parsers/survex.py | 54 |
2 files changed, 66 insertions, 0 deletions
diff --git a/parsers/logbooks.py b/parsers/logbooks.py index 2ede83f..3d96b3b 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -139,6 +139,18 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): # print(f'# {tid}') # print(f" - {tid} '{trippeople}' ") + """ + re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople) + + , : The comma character + \+ : The plus sign (+); escaped to treat as a literal character + &amp; : The literal string "&amp;" (HTML-encoded ampersand) + &(?!\w+;) : An ampersand (&) not followed by one or more word characters (\w+) and a semicolon (;) + : Uses negative lookahead assertion (?!...) to ensure it's not part of an HTML entity like "&nbsp;" + and : The literal string " and " (with spaces before and after) + + This will split the 'trippeople' string at any of these delimiters. + """ for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople): tripperson = tripperson.strip() # author_u = re.match(r"(?i)<u>(.*?)</u>$", tripperson) diff --git a/parsers/survex.py b/parsers/survex.py index 5338148..5f601e1 100644 --- a/parsers/survex.py +++ b/parsers/survex.py @@ -298,7 +298,61 @@ class LoadingSurvex: rx_commteam = re.compile(r"(?i)\s*(Messteam|Zeichner)\s*[:]?(.*)") rx_quotedtitle = re.compile(r'(?i)^"(.*)"$') + """ + Regular expression explanation for rx_starref (MS CoPilot) + + (?i) : Case-insensitive flag for the regex + ^ : Asserts the position at the start of a line + \s* : Matches zero or more whitespace characters + \*ref : Matches the literal string "*ref" + [\s.:]* : Matches zero or more whitespace characters, periods, or colons + + ((?:19[6789]\d)|(?:20[0123]\d)) + : Capturing group that matches a year in the 1960s-1990s or 2000s-2030s + : (?:...) is a non-capturing group + : 19[6789]\d matches years from 1960 to 1999 + : 20[0123]\d matches years from 2000 to 2039 + + \s* : Matches zero or more whitespace characters + #? 
: Matches zero or one "#" character + \s* : Matches zero or more whitespace characters + + (X)? : Capturing group that optionally matches the character "X" + \s* : Matches zero or more whitespace characters + + (.*?\d+.*?) : Capturing group that matches any character sequence containing at least one digit + : .*? matches any character (except newline), as few times as possible (non-greedy) + : \d+ matches one or more digits + : .*? matches any character (except newline), as few times as possible (non-greedy) + + $ : Asserts the position at the end of a line + Regular expression explanation for rx_argsref + + (?i) : Case-insensitive flag for the regex + ^ : Asserts the position at the start of a line + [\s.:]* : Matches zero or more whitespace characters, periods, or colons + + ((?:19[6789]\d)|(?:20[012345]\d)) + : Capturing group that matches a year in the 1960s-1990s or 2000s-2050s + : (?:...) is a non-capturing group + : 19[6789]\d matches years from 1960 to 1999 + : 20[012345]\d matches years from 2000 to 2059 + + \s* : Matches zero or more whitespace characters + #? : Matches zero or one "#" character + \s* : Matches zero or more whitespace characters + + (X)? : Capturing group that optionally matches the character "X" + \s* : Matches zero or more whitespace characters + + (.*?\d+.*?) : Capturing group that matches any character sequence containing at least one digit + : .*? matches any character (except newline), as few times as possible (non-greedy) + : \d+ matches one or more digits + : .*? matches any character (except newline), as few times as possible (non-greedy) + + $ : Asserts the position at the end of a line + """ # This interprets the survex "*data normal" command which sets out the order of the fields in the data, e.g. |