From c549e1de0245f399b219be8a2bdf1ee7d6152951 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Paul=20H=C3=A4nsch?=
Date: Sat, 23 Sep 2023 23:29:25 +0200
Subject: [PATCH] searchindex script for pruning and indexing

---
Changes in this revision: parse the documented --location option; reject
value options given without a value; fix the verbose progress output of
"index"; read page paths with "read -r"; silence stat portably; fix typos
in the help text.

 maintenance.sh |  75 -----------------------
 searchindex.sh | 161 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 161 insertions(+), 75 deletions(-)
 delete mode 100755 maintenance.sh
 create mode 100755 searchindex.sh

diff --git a/maintenance.sh b/maintenance.sh
deleted file mode 100755
index d94d6e3..0000000
--- a/maintenance.sh
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/bin/sh
-
-export _EXEC="${0%/*}/" _DATA="" # _DATE="$(date +%s)"
-verb=""
-
-help() {
-  ex="$1"
-
-  cat >&2 <<-EOF
-	This script should be run regularly via cron to remove outdated
-	records from search the index.
-
-	USAGE:
-
-	INSTALL_DIR/maintenance.sh SITE_DIR
-
-	maintenance.sh --exec "INSTALL_DIR" --data "SITE_DIR"
-
-	Options:
-
-	--exec INSTALL_DIR
-	  Point to the location of your shellwiki installation. This will
-	  default to the path at which the script is called, if it can be
-	  determined.
-
-	--data SITE_DIR
-	  Point to the location of your site installation. I.e. the directory
-	  containing your "pages/" and "index/" dir.
-
-	EOF
-
-  exit "${ex:-0}"
-}
-
-while [ $# -gt 0 ]; do case $1 in
-  --exec|-e) _EXEC="${2%/}"; shift 2;;
-  --data|-d) _DATA="${2%/}"; shift 2;;
-  --verbose|-v) verb=true; shift 1;;
-  --help) help 0;;
-  *) [ ! "$_DATA" ] \
-     && _DATA="${1%/}" \
-     || help 1
-     ;;
-esac; done
-
-if ! [ -d "$_DATA/pages/" -a -d "$_DATA/index/" ]; then
-  printf 'ERROR: %s\n\n' "\"${_DATA}\" does not seem to be valid site directory" >&2
-  help 1
-fi
-if ! [ -x "$_EXEC/parsers/40_indexer.sh" -a -x "$_EXEC/cgilite/storage.sh" ]; then
-  printf 'ERROR: %s\n\n' "could not determine shellwiki installation path (tried \"$_EXEC\")" >&2
-  help 1
-fi
-
-. "$_EXEC/cgilite/storage.sh"
-
-for word in "$_DATA/index"/*; do
-  [ "$word" = "$_DATA/index/*" ] && continue
-
-  [ "$verb" ] && printf '\r \r%s\r' "${word##*/}" >&2
-  mv -- "$word" "${word}.$$"
-
-  while read -r date location freq num total; do
-    l="$_DATA/pages$(UNSTRING "$location")#index.flag"
-    d="$(stat -c %Y "$l")" 2>&-
-
-    if [ "$date" -ge "$d" ] 2>&-; then
-      printf '%i %s %f %i %i\n' \
-        "$date" "$location" "$freq" "$num" "$total"
-    elif [ "$verb" ]; then
-      printf 'Removing "%s" from "%s"\n' "$location" "$word" >&2
-    fi
-  done <"${word}.$$" >>"${word}"
-  rm -- "${word}.$$"
-done
diff --git a/searchindex.sh b/searchindex.sh
new file mode 100755
index 0000000..af120f6
--- /dev/null
+++ b/searchindex.sh
@@ -0,0 +1,161 @@
+#!/bin/sh
+
+export _EXEC="${0%/*}/" _DATA="."
+verb="" v=0 cmd="" force="" location=""
+
+# Print the usage text to stderr and exit with status $1 (default 0).
+help() {
+  ex="$1"
+
+  cat >&2 <<-EOF
+	USAGE:
+
+	${0##*/} prune [--exec "INSTALL_DIR"] [--data "SITE_DIR"] [-v]
+
+	${0##*/} index [--exec "INSTALL_DIR"] [--data "SITE_DIR"] \\
+	  [--location "/PAGE"] [--force] [-v]
+
+	Commands:
+
+	prune
+	  Remove outdated records from the database. This is usually
+	  more time consuming than index creation. It is generally
+	  safe to run pruning while the wiki is online, even when
+	  pages are being updated. Although in rare cases a search
+	  operation may return incomplete results while running on
+	  a database being pruned.
+
+	  Pruning mode should be called regularly via cron.
+
+	index
+	  Add pages to the search index. Pages with a current index
+	  will be skipped unless the --force option is provided.
+	  Optionally a --location can be provided to add only a
+	  part of the document tree.
+
+	When running indexing and pruning together, indexing should be run
+	first and pruning afterwards.
+
+	Pruning becomes necessary with page updates, not during mere read
+	operation. On a medium traffic installation pruning should be run
+	about once a week.
+	Pruning the index more often than daily will rarely be necessary
+	and with low traffic sites monthly maintenance may be completely
+	fine.
+
+	Options:
+
+	--exec INSTALL_DIR
+	  Point to the location of your shellwiki installation. This will
+	  default to the path at which the script is called, if it can be
+	  determined.
+
+	--data SITE_DIR
+	  Point to the location of your site installation. I.e. the directory
+	  containing your "pages/" and "index/" dir. Defaults to working
+	  directory.
+
+	--force
+	  Add pages to index even if they seem to be indexed already.
+
+	--location /PAGE
+	  Index only the given page and its children. The path is given
+	  relative to the web root, i.e. without the DATA and "pages/"
+	  directory.
+
+	-v
+	  Be more verbose.
+	EOF
+
+  exit "${ex:-0}"
+}
+
+# Options taking a value need a second word; without this check "shift 2"
+# makes some shells abort with an error, while others (bash) leave the
+# arguments in place and loop forever.
+while [ $# -gt 0 ]; do case $1 in
+  --exec|-e) [ $# -ge 2 ] || help 1; _EXEC="${2%/}"; shift 2;;
+  --data|-d) [ $# -ge 2 ] || help 1; _DATA="${2%/}"; shift 2;;
+  --location) [ $# -ge 2 ] || help 1; location="$2"; shift 2;;
+  --verbose|-v) verb=true; shift 1;;
+  --force) force=true; shift 1;;
+  --help) help 0 2>&1;;
+  prune|index)
+    [ ! "$cmd" ] && cmd="$1" || help 1
+    shift 1;;
+  *) help 1;;
+esac; done
+
+if ! [ -d "$_DATA/pages/" -a -d "$_DATA/index/" ]; then
+  printf 'ERROR: %s\nTry --help\n' "\"${_DATA}\" does not seem to be a valid site directory" >&2
+  exit 1
+fi
+if ! [ -x "$_EXEC/parsers/40_indexer.sh" -a -x "$_EXEC/cgilite/storage.sh" ]; then
+  printf 'ERROR: %s\nTry --help\n' "could not determine shellwiki installation path (tried \"$_EXEC\")" >&2
+  exit 1
+fi
+if [ ! "$cmd" ]; then
+  help 1
+fi
+
+. "$_EXEC/cgilite/storage.sh"
+
+# Rewrite every word file in index/, keeping only the records whose date is
+# not older than the modification time of the referenced page's "#index.flag".
+prune() {
+  for word in "$_DATA/index"/*; do
+    [ "$word" = "$_DATA/index/*" ] && continue
+
+    [ "$verb" ] && printf "%${v}s\r%s\r" "" "${word##*/}" >&2
+    v="${#word}"
+    mv -- "$word" "${word}.$$"
+
+    while read -r date location freq num total; do
+      l="$_DATA/pages$(UNSTRING "$location")#index.flag"
+      # The redirect sits inside the substitution so stat stays quiet in
+      # every shell. A missing flag leaves $d empty and drops the record.
+      d="$(stat -c %Y "$l" 2>/dev/null)"
+
+      if [ "$date" -ge "$d" ] 2>&-; then
+        printf '%i %s %f %i %i\n' \
+          "$date" "$location" "$freq" "$num" "$total"
+      elif [ "$verb" ]; then
+        printf "%${v}s\rRemoving \"%s\" from \"%s\"\n" "" "$location" "$word" >&2
+      fi
+    done <"${word}.$$" >>"${word}"
+    rm -- "${word}.$$"
+  done
+}
+
+# Feed "#page.md" to the indexer for every page below pages/ (or below
+# --location) that is newer than its "#index.flag" or has no flag at all.
+index() {
+  export PATH_INFO="" _DATE="$(date +%s)"
+
+  # Normalise before testing, so that "--location /" means the whole tree.
+  location="${location#/}" location="${location%/}"
+
+  if [ "$location" ]; then
+    printf %s\\n "/${location}/"
+    find "$_DATA/pages/" -type d -path "$_DATA/pages/${location}/*" -not -name "#*" -printf "/%P/\n"
+  else
+    find "$_DATA/pages/" -type d -not -name "#*" -printf "/%P/\n"
+  fi \
+  | while IFS= read -r PATH_INFO; do
+    [ "$force" ] && rm -f -- "$_DATA/pages/$PATH_INFO/#index.flag"
+    if [ "$_DATA/pages/$PATH_INFO/#page.md" -nt "$_DATA/pages/$PATH_INFO/#index.flag" \
+         -o -f "$_DATA/pages/$PATH_INFO/#page.md" \
+         -a ! -f "$_DATA/pages/$PATH_INFO/#index.flag" ] 2>&-
+    then
+      # Blank out the previous progress line, then show the current page.
+      [ "$verb" ] && printf "%${v}s\r%s\r" "" "$PATH_INFO" >&2
+      v="${#PATH_INFO}"
+      "$_EXEC/parsers/40_indexer.sh" <"$_DATA/pages/$PATH_INFO/#page.md" >/dev/null
+    fi
+  done
+}
+
+case $cmd in
+  index) index;;
+  prune) prune;;
+esac
-- 
2.39.2