index ingest emulating Elasticsearch

author Paul Hänsch <paul@plutz.net>

Mon, 4 Mar 2024 17:32:10 +0000 (18:32 +0100)

committer Paul Hänsch <paul@plutz.net>

Mon, 4 Mar 2024 17:32:10 +0000 (18:32 +0100)
author Paul Hänsch <paul@plutz.net>
Mon, 4 Mar 2024 17:32:10 +0000 (18:32 +0100)
committer Paul Hänsch <paul@plutz.net>
Mon, 4 Mar 2024 17:32:10 +0000 (18:32 +0100)
diff --git a/concordance.sh b/concordance.sh

new file mode 100755 (executable)

index 0000000..fbc90ad
--- /dev/null
+++ b/concordance.sh
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+# Copyright 2023 - 2024 Paul Hänsch
+# 
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+# 
+# THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# ============================================================================
+# Read a document from STDIN and write a word index into DIR
+# Usage: concordance.sh DIR DOC_ID   (a newline is appended to the input before it is tokenized)
+DIR="$1" DOC_ID="$2"
+# Each distinct word appends one line "count<TAB>total<TAB>frequency<TAB>DOC_ID" to the file DIR/<word>.
+{ cat; printf \\n; } \
+| awk '
+  BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
+          if ( length("¡") == 1 ) {  # Utf-8 aware AWK
+            FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+          } else {                   # UTF-8 Hack: same block spelled as byte sequences E2 80 80-BF and E2 81 80-AF
+            FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
+          }
+        }
+        { for (n = 1; n <= NF; n++) {
+            if ( $n != "" && length($n) <= 128 ) {  # skip empty fields and tokens with length above 128
+              words[tolower($n)]++; total++;        # words are counted in lower case
+        } } }
+    END { for (w in words) printf "%i %i %f %s\n", words[w], total, words[w] / total, w; }
+' \
+| while read -r num total freq word; do
+  [ "$word" ] || continue
+  printf '%i\t%i\t%f\t%s\n' \
+         "$num" "$total" "$freq" "$DOC_ID" \
+  >>"$DIR/$word"
+done
diff --git a/index.cgi b/index.cgi

new file mode 100755 (executable)

index 0000000..cb325ea
--- /dev/null
+++ b/index.cgi
@@ -0,0 +1,206 @@
+#!/bin/sh
+# Load the cgilite libraries from $_EXEC, or from the directory of this script when $_EXEC is unset.
+. "${_EXEC:-${0%/*}}/cgilite/cgilite.sh"
+. "${_EXEC:-${0%/*}}/cgilite/storage.sh"
+. "${_EXEC:-${0%/*}}/cgilite/json.sh"
+# Request timestamp in epoch seconds, unless a non-empty _DATE is already present.
+: "${_DATE:=$(date +%s)}"
+# The index name is the first path component of PATH_INFO; its records file is _0_DOCS below it.
+_INDEX="${PATH_INFO#/}"; _INDEX="${_INDEX%%/*}"
+_records="${_DATA}/${_INDEX}/_0_DOCS"
+# ingest JSON: print to stdout the plain text of the base64 "content" field of the document object JSON.
+ingest() {
+  local J="$1"
+  # Commented-out lookups below are inactive (presumably debugging aids).
+  # json_get "$J" title
+  # json_get "$J" parts.comments
+  # The extension of "title" (matched case-sensitively) selects the converter.
+  case $(json_get "$J" title) in
+    *.md|*.txt|*.csv)  # plain text: the decoded content is passed through unchanged
+      json_get "$J" content |base64 -d
+      ;;
+    *.pdf)  # pdftotext: first "-" reads the PDF from stdin, second "-" sends the text to stdout
+      json_get "$J" content |base64 -d \
+      | pdftotext - -
+      ;;
+    *.doc)
+      json_get "$J" content |base64 -d \
+      | catdoc /dev/stdin
+      ;;
+    *.xls)
+      json_get "$J" content |base64 -d \
+      | xls2csv /dev/stdin
+      ;;
+    *.ppt)
+      json_get "$J" content |base64 -d \
+      | catppt /dev/stdin
+      ;;
+    *.html|*.xml|*.svg)  # markup: tags are deleted, text nodes remain
+      json_get "$J" content |base64 -d \
+      | sed 's;<[^>]*>;;g'
+      ;;
+    *.docx)  # zip container; NOTE(review): needs an unzip that can read the archive from a pipe - confirm
+      json_get "$J" content |base64 -d \
+      | unzip -qc /dev/stdin word/document.xml \
+      | head -c 128M | sed 's;<[^>]*>;;g'
+      ;;
+    *.xlsx)  # tags are replaced by a space here so neighbouring strings do not run together
+      json_get "$J" content |base64 -d \
+      | unzip -qc /dev/stdin xl/sharedStrings.xml \
+      | head -c 128M | sed 's;<[^>]*>; ;g'
+      ;;
+    *.odt)  # extracted XML is capped at 128M before the tags are deleted
+      json_get "$J" content |base64 -d \
+      | unzip -qc /dev/stdin content.xml \
+      | head -c 128M | sed 's;<[^>]*>;;g'
+      ;;
+    *.ods|*.odp)  # as for xlsx: tags become spaces
+      json_get "$J" content |base64 -d \
+      | unzip -qc /dev/stdin content.xml \
+      | head -c 128M | sed 's;<[^>]*>; ;g'
+      ;;
+    *):;;  # any other title: nothing is printed
+  esac
+}
+# Request dispatch by HTTP method: PUT indexes a document, DELETE drops its record, HEAD and GET answer probes.
+if   [ "$REQUEST_METHOD" = "PUT" ]; then
+  _doc="${PATH_INFO#"/${_INDEX}/_doc"}"  # NOTE(review): the id keeps its leading "/" - confirm this is intended
+  # Read at most CONTENT_LENGTH bytes of request body and hand them to json_load.
+  J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
+  # Pipe the extracted text into the word indexer; doc id and index date are joined by a TAB.
+  ingest "$J" \
+  | "${_EXEC}/concordance.sh" \
+    "$_DATA/$_INDEX/" "$(printf '%s\t%s' "$(STRING "$_doc")" "$_DATE")"
+  # Remove "content" and add "_indexdate" before storing (DB2 semantics assumed from its arguments).
+  J="${J#obj:}"
+  J="$(DB2 "$J" delete content)"
+  J="$(DB2 "$J" set _indexdate num:"$_DATE")"
+  # Insert first; when that fails try an update; when both fail report a server error.
+  if   DBM "$_records" insert "$_doc" "$J"; then
+    printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")"
+    result="created"
+  elif DBM "$_records" update "$_doc" "$J"; then
+    printf '%s: %s\r\n' "Status" "200 OK"
+    result="updated"
+  else
+    printf '%s\r\n' "Status: 500 Internal Server Error" ""
+    exit 0
+  fi
+  # Remaining headers, the empty separator line and the JSON body; sed rewrites line ends to CRLF.
+  sed 's;$;\r;' <<EOF
+X-elastic-product: Elasticsearch
+content-type: application/vnd.elasticsearch+json;compatible-with=8
+
+{ "_index": $(json_dump str:"${_INDEX}"),
+  "_id": $(json_dump str:"$_doc"),
+  "result": "$result",
+  "_indexdate": $_DATE
+}
+EOF
+  exit 0
+
+elif [ "$REQUEST_METHOD" = "DELETE" ]; then
+  _doc="${PATH_INFO#"/${_INDEX}/_doc"}"
+  # Existence check only: any output of "get" is discarded so it cannot precede the headers.
+  if   DBM "$_records" get "$_doc" >/dev/null; then
+    if   DBM "$_records" delete "$_doc"; then
+      printf '%s: %s\r\n' "Status" "200 OK"
+      result="deleted"
+    else
+      printf '%s\r\n' "Status: 500 Internal Server Error" ""
+      exit 0
+    fi
+  else
+    printf '%s: %s\r\n' "Status" "404 Not Found"
+    result="not_found"
+  fi
+  # Remaining headers, the empty separator line and the JSON body, with CRLF line ends.
+  sed 's;$;\r;' <<EOF
+X-elastic-product: Elasticsearch
+content-type: application/vnd.elasticsearch+json;compatible-with=8
+
+{ "_index": $(json_dump str:"${_INDEX}"),
+  "_id": $(json_dump str:"$_doc"),
+  "result": "$result",
+  "_indexdate": $_DATE
+}
+EOF
+  exit 0
+
+elif [ "$REQUEST_METHOD" = "POST" ]; then
+  :  # POST is accepted but not implemented: no response is written
+elif [ "$REQUEST_METHOD" = "HEAD" ]; then  # NOTE(review): HEAD/GET send an HTTP status line, PUT/DELETE a Status header - confirm
+  accept="$(HEADER Accept)"
+  case "$accept" in *"vnd.elasticsearch+json"*) ctype="${accept}" ;;
+                    *) ctype="application/json" ;; esac  # echo a vendor Accept type back, else plain JSON
+  sed 's;$;\r;' <<EOF
+HTTP/1.1 200 OK
+X-elastic-product: Elasticsearch
+content-type: ${ctype}
+
+EOF
+  exit 0
+
+elif [ "$REQUEST_METHOD" = "GET" ]; then
+  accept="$(HEADER Accept)"
+  case "$accept" in *"vnd.elasticsearch+json"*) ctype="${accept}" ;;
+                    *) ctype="application/json" ;; esac  # echo a vendor Accept type back, else plain JSON
+  # Status line and headers, terminated by an empty line.
+  sed 's;$;\r;' <<EOF
+HTTP/1.1 200 OK
+X-elastic-product: Elasticsearch
+content-type: ${ctype}
+
+EOF
+  # "/<index>/" gets a canned index description, every other path the cluster banner.
+  if [ "$PATH_INFO" = "/${_INDEX}/" ]; then
+  sed 's;$;\r;' <<EOF
+{ $(json_dump str:"${_INDEX}"): {
+    "aliases":{},
+    "mappings": {
+      "properties": {
+        "content": {"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
+        "hash":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
+        "metatags":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
+        "owner":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
+        "parts":{"properties":{"comments":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
+        "provider":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
+        "share_names":{"properties":{"paul":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
+        "source":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
+        "title":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}
+      }
+    },
+    "settings": {
+      "index": {
+        "routing":{"allocation":{"include":{"_tier_preference":"data_content"}}},
+        "number_of_shards":"1",
+        "provided_name": $(json_dump str:"${_INDEX}"),
+        "creation_date": "$(stat -c %W "${_DATA}/${_INDEX}")",
+        "number_of_replicas":"1",
+        "uuid":"0000000000000000000000",
+        "version":{"created":"8500010"}
+      }
+    }
+  }
+}
+EOF
+  else
+    sed 's;$;\r;' <<EOF
+{ "name" : "head",
+  "cluster_name" : "elasticsearch",
+  "version" : {
+    "number" : "8.12.1",
+    "lucene_version" : "9.9.2",
+    "minimum_wire_compatibility_version" : "7.17.0",
+    "minimum_index_compatibility_version" : "7.0.0"
+  },
+  "tagline" : "You Know, for Search"
+}
+EOF
+  fi
+  exit 0
+
+else
+  printf '%s\r\n' "Status: 500 Internal Server Error" ""
+  exit 0
+fi
author	Paul Hänsch <paul@plutz.net>
	Mon, 4 Mar 2024 17:32:10 +0000 (18:32 +0100)
committer	Paul Hänsch <paul@plutz.net>
	Mon, 4 Mar 2024 17:32:10 +0000 (18:32 +0100)
concordance.sh	[new file with mode: 0755]	patch \| blob
index.cgi	[new file with mode: 0755]	patch \| blob