From e16cbbded1805c1cd2256b7679bd299dc4746579 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Paul=20H=C3=A4nsch?=
Date: Fri, 22 Sep 2023 15:26:12 +0200
Subject: [PATCH] separate word index by unicode punctuation

Split search terms and indexed words on characters from the Unicode
block U+2000 - U+206F (General Punctuation), in addition to the ASCII
punctuation and %XX sequences that the indexer already treated as
field separators. The search handler now uses the same AWK field
separator as the indexer instead of a sed expression.

AWK implementations that count characters (length() of a two-byte
character is 1) get a single bracket range. Byte-oriented
implementations get the equivalent UTF-8 byte patterns:
E2 80 80-BF covers U+2000 - U+203F, E2 81 80-AF covers U+2040 - U+206F.
---
Notes:
  - The byte-oriented fallback starts its second range at \200, not
    \201, so that U+2040 (E2 81 80) is a separator as well.
  - The AWK if/else is not closed with a shell-style "fi"; AWK has no
    such keyword and would only evaluate it as an unset variable.

 handlers/40_search.sh | 11 ++++++++---
 parsers/40_indexer.sh |  7 ++++++-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/handlers/40_search.sh b/handlers/40_search.sh
index c558f53..c9bc627 100644
--- a/handlers/40_search.sh
+++ b/handlers/40_search.sh
@@ -5,9 +5,14 @@
 . "$_EXEC/cgilite/storage.sh"
 
 I="$_DATA/index"
-words="$( GET q | sed -E '
-  :X $bY; N; bX; :Y
-  s;([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[1-9A-Fa-f]{2})+; ;g
+words="$( GET q | awk '
+  BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
+    if ( length("¡") == 1 )  # UTF-8 aware AWK: one bracket range covers the whole block
+      FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+    else                     # byte-oriented AWK: match the UTF-8 bytes E2 80 80-BF and E2 81 80-AF
+      FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
+  }
+  { for (n = 1; n <= NF; n++) printf "%s ", $n; }
 ')"
 
 for w in ${words}; do
diff --git a/parsers/40_indexer.sh b/parsers/40_indexer.sh
index ec7be4a..e3065c1 100755
--- a/parsers/40_indexer.sh
+++ b/parsers/40_indexer.sh
@@ -34,7 +34,12 @@ mkdir -p "$I"
     printf '%s\n' "$line"
   done \
   | awk '
-    BEGIN { FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2})+" }
+    BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
+      if ( length("¡") == 1 )  # UTF-8 aware AWK: one bracket range covers the whole block
+        FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+      else                     # byte-oriented AWK: match the UTF-8 bytes E2 80 80-BF and E2 81 80-AF
+        FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
+    }
     { for (n = 1; n <= NF; n++) {
         if ( $n != "" && length($n) <= 128 ) {
           words[tolower($n)]++; total++;
-- 
2.39.2