]> git.plutz.net Git - shellwiki/commitdiff
separate word index by unicode punctuation
author Paul Hänsch <paul@plutz.net>
Fri, 22 Sep 2023 13:26:12 +0000 (15:26 +0200)
committer Paul Hänsch <paul@plutz.net>
Fri, 22 Sep 2023 13:26:12 +0000 (15:26 +0200)
handlers/40_search.sh
parsers/40_indexer.sh

index c558f5394b9cd01b8a86063d69a291de094b7fbc..c9bc62738c985f8db46b8a4326f5eac018c08613 100644 (file)
@@ -5,9 +5,15 @@
 . "$_EXEC/cgilite/storage.sh"
 
 I="$_DATA/index"
-words="$( GET q | sed -E '
-  :X $bY; N; bX; :Y
-  s;([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[1-9A-Fa-f]{2})+;     ;g
+words="$( GET q | awk '
+  BEGIN { # FS splits on ASCII punctuation, %XX escapes, and Unicode General Punctuation (U+2000 - U+206F)
+          if ( length("¡") == 1 ) {  # UTF-8 aware AWK: a multibyte character range works inside a bracket expression
+            FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+          } else {                   # Byte oriented AWK: spell the block out as UTF-8 bytes (E2 80 80-BF and E2 81 80-AF)
+            FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
+          }
+        }
+       { for (n = 1; n <= NF; n++) printf "%s  ", $n; }
 ')"
 
 for w in ${words}; do
index ec7be4a7f4a6d9fa17da3b19ed762c814ce9b78b..e3065c14aa11b8c5c82b357188f008ce48223e3a 100755 (executable)
@@ -34,7 +34,13 @@ mkdir -p "$I"
   printf '%s\n' "$line"
 done \
 | awk '
-  BEGIN { FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2})+" }
+  BEGIN { # FS splits on ASCII punctuation, %XX escapes, and Unicode General Punctuation (U+2000 - U+206F)
+          if ( length("¡") == 1 ) {  # UTF-8 aware AWK: a multibyte character range works inside a bracket expression
+            FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+          } else {                   # Byte oriented AWK: spell the block out as UTF-8 bytes (E2 80 80-BF and E2 81 80-AF)
+            FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
+          }
+        }
         { for (n = 1; n <= NF; n++) {
             if ( $n != "" && length($n) <= 128 ) {
               words[tolower($n)]++; total++;