From: Paul Hänsch
Date: Fri, 22 Sep 2023 13:26:12 +0000 (+0200)
Subject: separate word index by unicode punctuation
X-Git-Url: https://git.plutz.net/?a=commitdiff_plain;h=e16cbbded1805c1cd2256b7679bd299dc4746579;p=shellwiki

separate word index by unicode punctuation
---

diff --git a/handlers/40_search.sh b/handlers/40_search.sh
index c558f53..c9bc627 100644
--- a/handlers/40_search.sh
+++ b/handlers/40_search.sh
@@ -5,9 +5,15 @@
 . "$_EXEC/cgilite/storage.sh"
 
 I="$_DATA/index"
-words="$( GET q | sed -E '
-    :X $bY; N; bX; :Y
-    s;([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[1-9A-Fa-f]{2})+; ;g
+words="$( GET q | awk '
+    BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
+            # Percent-encoded bytes (%XX) separate words as well; a run of separators counts as one
+            if ( length("¡") == 1 )  # UTF-8 aware AWK: the multi-byte "¡" counts as one character
+              FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+            else  # UTF-8 hack for byte-wise AWK: E2 80 80-BF covers U+2000 - U+203F, E2 81 80-AF covers U+2040 - U+206F
+              FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
+          }
+    { for (n = 1; n <= NF; n++) printf "%s ", $n; }  # flat, blank separated word list
   ')"
 
 for w in ${words}; do
diff --git a/parsers/40_indexer.sh b/parsers/40_indexer.sh
index ec7be4a..e3065c1 100755
--- a/parsers/40_indexer.sh
+++ b/parsers/40_indexer.sh
@@ -34,7 +34,13 @@ mkdir -p "$I"
     printf '%s\n' "$line"
   done \
   | awk '
-    BEGIN { FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2})+" }
+    BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
+            # Percent-encoded bytes (%XX) separate words as well; a run of separators counts as one
+            if ( length("¡") == 1 )  # UTF-8 aware AWK: the multi-byte "¡" counts as one character
+              FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+            else  # UTF-8 hack for byte-wise AWK: E2 80 80-BF covers U+2000 - U+203F, E2 81 80-AF covers U+2040 - U+206F
+              FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
+          }
     { for (n = 1; n <= NF; n++) {
         if ( $n != "" && length($n) <= 128 ) {
           words[tolower($n)]++; total++;