From feca7d3f5bff6cd8b1437adb95686a77ab8e9367 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Paul=20H=C3=A4nsch?=
Date: Thu, 21 Sep 2023 21:45:36 +0200
Subject: [PATCH] introducing text indexer

---
 parsers/40_indexer.sh | 83 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100755 parsers/40_indexer.sh

diff --git a/parsers/40_indexer.sh b/parsers/40_indexer.sh
new file mode 100755
index 0000000..d5783f4
--- /dev/null
+++ b/parsers/40_indexer.sh
@@ -0,0 +1,83 @@
+#!/bin/sh
+
+# Copyright 2023 Paul Hänsch
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# Indexing filter: copies its input to stdout (with one newline appended when
+# indexing) and records how often each word occurs, writing one entry for
+# this page into a per-word file below "$_DATA/index/".
+#
+# Environment read: PATH_INFO, _DATA, _EXEC, _DATE.
+# STRING, LOCK and RELEASE are not defined here; they are expected to be
+# provided by the sourced cgilite/storage.sh.
+
+# Page path: PATH_INFO with a trailing "/[...]" component removed, made to
+# end in a slash.
+DOC="${PATH_INFO%/}/"
+DOC="${DOC%/\[*\]/}"
+DOC="${DOC%/}/"
+P="$_DATA/pages${DOC}"
+I="$_DATA/index/"
+
+# Flag exists and the page is not newer than it: nothing to index, so pass
+# the input through unchanged.
+if [ -f "$P/#index.flag" ] && [ ! "$P/#page.md" -nt "$P/#index.flag" ]; then
+  cat
+  exit 0
+fi
+
+. "$_EXEC/cgilite/storage.sh"
+
+# Keep the original stdout on fd 3; inside the pipeline stdout feeds awk.
+exec 3>&1
+
+touch "$P/#index.flag"
+mkdir -p "$I"
+
+# Encoded page name; identical for every word, so computed once up front.
+L="$(STRING "$DOC")"
+
+# The appended newline ensures a final unterminated line is still read.
+{ cat; printf \\n; } \
+| while IFS='' read -r line; do
+  printf '%s\n' "$line" >&3
+  printf '%s\n' "$line"
+done \
+| awk '
+  # Fields are separated by runs of whitespace and punctuation.
+  BEGIN { FS = "[] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]+" }
+  { for (n = 1; n <= NF; n++) if ( $n != "" ) { words[tolower($n)]++; total++; } }
+  # Output per word: count, total words, relative frequency, word.
+  END { for (w in words) printf "%i %i %f %s\n", words[w], total, words[w] / total, w; }
+' \
+| while read -r num total freq word; do
+  # -r: a word ending in a backslash must not swallow the following line.
+  [ "$word" ] || continue
+  F="$I/$word"
+
+  if LOCK "$F"; then
+    touch "$F"
+    # Rewrite the word file: keep entries of other pages, replace our own.
+    # -r keeps backslashes in stored page names intact across rewrites.
+    {
+      while read -r d l f n t; do
+        [ "$l" = "$L" ] \
+        || printf "%i %s %f %i %i\n" "$d" "$l" "$f" "$n" "$t"
+      done <"$F"
+      printf "%i %s %f %i %i\n" "$_DATE" "$L" "$freq" "$num" "$total"
+    } >"$F.$$"
+    mv -- "$F.$$" "$F"
+    RELEASE "$F"
+  fi
+done
-- 
2.39.2