From fb9c64d7613820866218002d77a3f7f41d602dc0 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Mon, 4 Mar 2024 22:18:32 +0100 Subject: [PATCH] faster ingest; search function --- index.cgi | 124 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 106 insertions(+), 18 deletions(-) diff --git a/index.cgi b/index.cgi index cb325ea..61f811e 100755 --- a/index.cgi +++ b/index.cgi @@ -6,9 +6,6 @@ [ "$_DATE" ] || _DATE="$(date +%s)" -_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}" -_records="${_DATA}/${_INDEX}/_0_DOCS" - ingest() { local J="$1" @@ -17,45 +14,45 @@ ingest() { case $(json_get "$J" title) in *.md|*.txt|*.csv) - json_get "$J" content |base64 -d + printf %s "$content" |base64 -d ;; *.pdf) - json_get "$J" content |base64 -d \ - | pdftotext - + printf %s "$content" |base64 -d \ + | pdftotext - - ;; *.doc) - json_get "$J" content |base64 -d \ + printf %s "$content" |base64 -d \ | catdoc /dev/stdin ;; *.xls) - json_get "$J" content |base64 -d \ + printf %s "$content" |base64 -d \ | xls2csv /dev/stdin ;; *.ppt) - json_get "$J" content |base64 -d \ + printf %s "$content" |base64 -d \ | catppt /dev/stdin ;; *.html|*.xml|*.svg) - json_get "$J" content |base64 -d \ + printf %s "$content" |base64 -d \ | sed 's;<[^>]*>;;g' ;; *.docx) - json_get "$J" content |base64 -d \ + printf %s "$content" |base64 -d \ | unzip -qc /dev/stdin word/document.xml \ | head -c 128M | sed 's;<[^>]*>;;g' ;; *.xlsx) - json_get "$J" content |base64 -d \ + printf %s "$content" |base64 -d \ | unzip -qc /dev/stdin xl/sharedStrings.xml \ | head -c 128M | sed 's;<[^>]*>; ;g' ;; *.odt) - json_get "$J" content |base64 -d \ + printf %s "$content" |base64 -d \ | unzip -qc /dev/stdin content.xml \ | head -c 128M | sed 's;<[^>]*>;;g' ;; *.ods|*.odp) - json_get "$J" content |base64 -d \ + printf %s "$content" |base64 -d \ | unzip -qc /dev/stdin content.xml \ | head -c 128M | sed 's;<[^>]*>; ;g' ;; @@ -63,17 +60,72 @@ ingest() { esac } +search() { + local index="$1" words w num total freq doc date J + shift 1; words="$@" + + words="$(printf %s\\n "$words" | awk ' + BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F + if ( length("¡") == 1 ) # Utf-8 aware AWK + FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+"; + else # UTF-8 Hack + FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\201-\257]')"')+"; + fi + } + { for (n = 1; n <= NF; n++) printf "%s ", tolower($n); } + ')" + + for w in ${words}; do + [ ! -f "${index}/$w" ] && continue + + while read num total freq doc date; do + printf '%s-%i %f\n' "${doc}" "${date}" "$freq" + done <"${index}/$w" + done \ + | awk ' + { cnt[$1]++; weight[$1] = weight[$1] ? weight[$1] + $2 : $2; } + END { m = 0; for (d in cnt) m = ( m < cnt[d] ) ? cnt[d] : m; + for (d in cnt) if ( cnt[d] == m ) printf "%f %s\n", weight[d], d; + } + ' \ + | sort -nr \ + | while read freq doc; do + date="${doc##*-}" doc="$(UNSTRING "${doc%-*}")" + + if J="$(DBM "$_records" get "$doc")"; then + [ "$date" -eq "$(json_get obj:"$J" _indexdate)" ] \ + && printf '%f %s %s\n' \ + "$freq" "$(STRING "$doc")" "$(STRING "$J")" + fi + done +} + +_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}" +_records="${_DATA}/${_INDEX}/_0_DOCS" + if [ "$REQUEST_METHOD" = "PUT" ]; then _doc="${PATH_INFO#"/${_INDEX}/_doc"}" - J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")" + J="$(head -c "${CONTENT_LENGTH:-0}")" + # Don't use json parser to get content field + # Content can be very large and the json parser is slow + content="$(printf %s\\n "$J" |sed -E ' + :X; $bY; N; bX; :Y; + s;^.*,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*";; + s;".*$;; + s;\\;;g; + ')" + J="$(printf %s\\n "$J" |sed -E ' + :X; $bY; N; bX; :Y; + s;,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*"[^"]*";; + ')" + J="$(json_load "${J}")" - ingest "$J" \ + ingest "$J" "$content"\ | "${_EXEC}/concordance.sh" \ "$_DATA/$_INDEX/" "$(STRING "$_doc") $_DATE" J="${J#obj:}" - J="$(DB2 "$J" delete content)" J="$(DB2 "$J" set _indexdate num:"$_DATE")" if DBM "$_records" insert "$_doc" "$J"; then @@ -128,7 +180,42 @@ elif [ "$REQUEST_METHOD" = "DELETE" ]; then exit 0 elif [ "$REQUEST_METHOD" = "POST" ]; then - : + J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")" + J="$(json_get "$J" query.bool.must.bool.should)" + + words="$( + for j in $(DB2 "$J" iterate @); do + json_get "$(UNSTRING "$j")" match_phrase_prefix.content + done 2>/dev/null + )" + + results="@ $( + search "${_DATA}/${_INDEX}" $words \ + | while read -r score id source; do + S="$(DB2 "" set _id str:"$(UNSTRING "${id#/}")")" + S="$(DB2 "$S" set _score num:"$score")" + S="$(DB2 "$S" set _source obj:"$(UNSTRING "$source")")" + printf 'obj:%s\t' "$(STRING "$S")" + done + )" + results="${results% }" + + sed 's;$;\r;' <<-EOF + Status: 200 OK + X-elastic-product: Elasticsearch + Content-Type: application/vnd.elasticsearch+json;compatible-with=8 + + { "took":0, + "timed_out":false, + "_shards":{"total":1,"successful":1,"skipped":0,"failed":0}, + "hits": { + "total":{"value": $(DB2 "$results" count @) ,"relation":"eq"}, + "max_score": $(json_get "arr:$results" '[0]._score' 2>&- || printf 0), + "hits": $(json_dump "arr:$results") + } + } + EOF + elif [ "$REQUEST_METHOD" = "HEAD" ]; then accept="$(HEADER Accept)" [ ! "${accept#*"vnd.elasticsearch+json"*}" ] \ @@ -138,6 +225,7 @@ elif [ "$REQUEST_METHOD" = "HEAD" ]; then HTTP/1.1 200 OK X-elastic-product: Elasticsearch content-type: ${ctype} + EOF exit 0 -- 2.39.2