From b812c21251a6dca0f419869c773d86cfc1bd3942 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Paul=20H=C3=A4nsch?=
Date: Mon, 4 Mar 2024 18:32:10 +0100
Subject: [PATCH] index ingest emulating Elasticsearch

---
 concordance.sh |  42 ++++++++++
 index.cgi      | 206 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 248 insertions(+)
 create mode 100755 concordance.sh
 create mode 100755 index.cgi

diff --git a/concordance.sh b/concordance.sh
new file mode 100755
index 0000000..fbc90ad
--- /dev/null
+++ b/concordance.sh
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+# Copyright 2023 - 2024 Paul Hänsch
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# ============================================================================
+# Read a document from STDIN and write a word index into DIR
+
+DIR="$1" DOC_ID="$2"
+
+{ cat; printf \\n; } \
+| awk '
+  BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
+    if ( length("¡") == 1 )  # Utf-8 aware AWK
+      FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+    else  # UTF-8 Hack
+      FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\201-\257]')"')+";
+  }
+  { for (n = 1; n <= NF; n++) {
+      if ( $n != "" && length($n) <= 128 ) {
+        words[tolower($n)]++; total++;
+  } } }
+  END { for (w in words) printf "%i %i %f %s\n", words[w], total, words[w] / total, w; }
+' \
+| while read -r num total freq word; do
+  [ "$word" ] || continue
+  printf '%i %i %f %s\n' \
+    "$num" "$total" "$freq" "$DOC_ID" \
+    >>"$DIR/$word"
+done
diff --git a/index.cgi b/index.cgi
new file mode 100755
index 0000000..cb325ea
--- /dev/null
+++ b/index.cgi
@@ -0,0 +1,206 @@
+#!/bin/sh
+
+. "${_EXEC:-${0%/*}}/cgilite/cgilite.sh"
+. "${_EXEC:-${0%/*}}/cgilite/storage.sh"
+. "${_EXEC:-${0%/*}}/cgilite/json.sh"
"${_EXEC:-${0%/*}}/cgilite/json.sh" + +[ "$_DATE" ] || _DATE="$(date +%s)" + +_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}" +_records="${_DATA}/${_INDEX}/_0_DOCS" + +ingest() { + local J="$1" + + # json_get "$J" title + # json_get "$J" parts.comments + + case $(json_get "$J" title) in + *.md|*.txt|*.csv) + json_get "$J" content |base64 -d + ;; + *.pdf) + json_get "$J" content |base64 -d \ + | pdftotext - + ;; + *.doc) + json_get "$J" content |base64 -d \ + | catdoc /dev/stdin + ;; + *.xls) + json_get "$J" content |base64 -d \ + | xls2csv /dev/stdin + ;; + *.ppt) + json_get "$J" content |base64 -d \ + | catppt /dev/stdin + ;; + *.html|*.xml|*.svg) + json_get "$J" content |base64 -d \ + | sed 's;<[^>]*>;;g' + ;; + *.docx) + json_get "$J" content |base64 -d \ + | unzip -qc /dev/stdin word/document.xml \ + | head -c 128M | sed 's;<[^>]*>;;g' + ;; + *.xlsx) + json_get "$J" content |base64 -d \ + | unzip -qc /dev/stdin xl/sharedStrings.xml \ + | head -c 128M | sed 's;<[^>]*>; ;g' + ;; + *.odt) + json_get "$J" content |base64 -d \ + | unzip -qc /dev/stdin content.xml \ + | head -c 128M | sed 's;<[^>]*>;;g' + ;; + *.ods|*.odp) + json_get "$J" content |base64 -d \ + | unzip -qc /dev/stdin content.xml \ + | head -c 128M | sed 's;<[^>]*>; ;g' + ;; + *):;; + esac +} + +if [ "$REQUEST_METHOD" = "PUT" ]; then + _doc="${PATH_INFO#"/${_INDEX}/_doc"}" + + J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")" + + ingest "$J" \ + | "${_EXEC}/concordance.sh" \ + "$_DATA/$_INDEX/" "$(STRING "$_doc") $_DATE" + + J="${J#obj:}" + J="$(DB2 "$J" delete content)" + J="$(DB2 "$J" set _indexdate num:"$_DATE")" + + if DBM "$_records" insert "$_doc" "$J"; then + printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")" \ + result="created" + elif DBM "$_records" update "$_doc" "$J"; then + printf '%s: %s\r\n' "Status" "200 OK" + result="updated" + else + printf '%s\r\n' "Status: 500 Internal Server Error" "" + exit 0 + fi + + sed 's;$;\r;' <<-EOF + X-elastic-product: Elasticsearch + content-type: application/vnd.elasticsearch+json;compatible-with=8 + + { "_index": $(json_dump str:"${_INDEX}"), + "_id": $(json_dump str:"$_doc"), + "result": "$result", + "_indexdate": $_DATE + } + EOF + exit 0 + +elif [ "$REQUEST_METHOD" = "DELETE" ]; then + _doc="${PATH_INFO#"/${_INDEX}/_doc"}" + + if DBM "$_records" get "$_doc"; then + if DBM "$_records" delete "$_doc"; then + printf '%s: %s\r\n' "Status" "200 OK" + result="deleted" + else + printf '%s\r\n' "Status: 500 Internal Server Error" "" + exit 0 + fi + else + printf '%s: %s\r\n' "Status" "404 Not Found" + result="not_found" + fi + + sed 's;$;\r;' <<-EOF + X-elastic-product: Elasticsearch + content-type: application/vnd.elasticsearch+json;compatible-with=8 + + { "_index": $(json_dump str:"${_INDEX}"), + "_id": $(json_dump str:"$_doc"), + "result": "$result", + "_indexdate": $_DATE + } + EOF + exit 0 + +elif [ "$REQUEST_METHOD" = "POST" ]; then + : +elif [ "$REQUEST_METHOD" = "HEAD" ]; then + accept="$(HEADER Accept)" + [ ! "${accept#*"vnd.elasticsearch+json"*}" ] \ + && ctype="${accept}" || ctype="application/json" + + sed 's;$;\r;' <<-EOF + HTTP/1.1 200 OK + X-elastic-product: Elasticsearch + content-type: ${ctype} + EOF + exit 0 + +elif [ "$REQUEST_METHOD" = "GET" ]; then + accept="$(HEADER Accept)" + [ ! 
"${accept#*"vnd.elasticsearch+json"*}" ] \ + && ctype="${accept}" || ctype="application/json" + + sed 's;$;\r;' <<-EOF + HTTP/1.1 200 OK + X-elastic-product: Elasticsearch + content-type: ${ctype} + + EOF + + if [ "$PATH_INFO" = "/${_INDEX}/" ]; then + sed 's;$;\r;' <<-EOF + { $(json_dump str:"${_INDEX}"): { + "aliases":{}, + "mappings": { + "properties": { + "content": {"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}, + "hash":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}, + "metatags":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}, + "owner":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}, + "parts":{"properties":{"comments":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}}, + "provider":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}, + "share_names":{"properties":{"paul":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}}, + "source":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}, + "title":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}} + } + }, + "settings": { + "index": { + "routing":{"allocation":{"include":{"_tier_preference":"data_content"}}}, + "number_of_shards":"1", + "provided_name": $(json_dump str:"${_INDEX}"), + "creation_date": "$(stat -c %W "${_DATA}/${_INDEX}")", + "number_of_replicas":"1", + "uuid":"0000000000000000000000", + "version":{"created":"8500010"} + } + } + } + } + EOF + else + sed 's;$;\r;' <<-EOF + { "name" : "head", + "cluster_name" : "elasticsearch", + "version" : { + "number" : "8.12.1", + "lucene_version" : "9.9.2", + "minimum_wire_compatibility_version" : "7.17.0", + "minimum_index_compatibility_version" : "7.0.0" + }, + "tagline" : "You Know, for Search" + } + EOF + fi + exit 0 + +else + printf '%s\r\n' "Status: 500 Internal Server Error" "" + exit 0 +fi -- 2.39.2