From 0054bedc4d290102bb839f81da9dc3a44e3bf679 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Paul=20H=C3=A4nsch?=
Date: Fri, 22 Sep 2023 11:20:24 +0200
Subject: [PATCH] treat %XX URL sequences as field stop in indexing, do not index words > 128 characters

handlers/40_search.sh:
  - Split the query with sed instead of tr, so that a run of punctuation
    and/or %XX sequences (XX = two hex digits) collapses into one blank.
    The hex class is [0-9A-Fa-f], the same one the indexer uses, so a
    query is broken into words exactly like the indexed text.
  - Compare an index entry's date against the mtime of "#index.flag"
    instead of "#page.md".

parsers/40_indexer.sh:
  - Use PATH_INFO as the document path as is; a trailing "/[...]/"
    component is no longer stripped.
  - Pass the input through unchanged (cat) when the page directory does
    not exist, in addition to the existing "index is up to date" case.
  - Treat %XX sequences as part of the awk field separator. The two hex
    digits are spelled out instead of using "{2}", because awk
    implementations without interval expression support would take the
    braces literally.
  - Do not count words longer than 128 characters.
---
 handlers/40_search.sh |  7 +++++--
 parsers/40_indexer.sh | 12 +++++++-----
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/handlers/40_search.sh b/handlers/40_search.sh
index 3db8987..c558f53 100644
--- a/handlers/40_search.sh
+++ b/handlers/40_search.sh
@@ -5,14 +5,17 @@
 . "$_EXEC/cgilite/storage.sh"
 
 I="$_DATA/index"
-words="$(GET q |tr '] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-' ' ')"
+words="$( GET q | sed -E '
+  :X $bY; N; bX; :Y
+  s;([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2})+; ;g
+')"
 
 for w in ${words}; do
   [ ! -f "$I/$w" ] && continue
 
   while read date doc freq num total; do
     P="$_DATA/pages$(UNSTRING "$doc")"
-    d="$(stat -c %Y -- "$P/#page.md")"
+    d="$(stat -c %Y -- "$P/#index.flag")"
     [ "$d" -gt "$date" ] && continue
 
     printf '%s %f\n' "$doc" "$freq"
diff --git a/parsers/40_indexer.sh b/parsers/40_indexer.sh
index d5783f4..3cfc72c 100755
--- a/parsers/40_indexer.sh
+++ b/parsers/40_indexer.sh
@@ -14,10 +14,9 @@
 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
 # IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
-DOC="${PATH_INFO%/}/" DOC="${DOC%/\[*\]/}" DOC="${DOC%/}/"
-P="$_DATA/pages${DOC}" I="$_DATA/index/"
+DOC="${PATH_INFO%/}/" P="$_DATA/pages${DOC}" I="$_DATA/index/"
 
-if [ -f "$P/#index.flag" -a ! "$P/#page.md" -nt "$P/#index.flag" ]; then
+if [ -f "$P/#index.flag" -a ! "$P/#page.md" -nt "$P/#index.flag" ] || [ ! -d "$P" ]; then
   cat
   exit 0
 fi
@@ -35,8 +34,11 @@ mkdir -p "$I"
   printf '%s\n' "$line"
 done \
 | awk '
-  BEGIN { FS = "[] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]+" }
-  { for (n = 1; n <= NF; n++) if ( $n != "" ) { words[tolower($n)]++; total++; } }
+  BEGIN { FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f][0-9A-Fa-f])+" }
+  { for (n = 1; n <= NF; n++) {
+      if ( $n != "" && length($n) <= 128 ) {
+        words[tolower($n)]++; total++;
+  } } }
   END { for (w in words) printf "%i %i %f %s\n", words[w], total, words[w] / total, w; }
 ' \
 | while read num total freq word; do
-- 
2.39.2