--- /dev/null
+#!/bin/sh
+
+# Copyright 2023 - 2024 Paul Hänsch
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+# IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# ============================================================================
+# Read a document from STDIN and write a word index into DIR
+
+DIR="$1" DOC_ID="$2"
+
+{ cat; printf \\n; } \
+| awk '
+ BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
+ if ( length("¡") == 1 ) # Utf-8 aware AWK
+ FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+ else # UTF-8 Hack
+ FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\201-\257]')"')+";
+ fi
+ }
+ { for (n = 1; n <= NF; n++) {
+ if ( $n != "" && length($n) <= 128 ) {
+ words[tolower($n)]++; total++;
+ } } }
+ END { for (w in words) printf "%i %i %f %s\n", words[w], total, words[w] / total, w; }
+' \
+| while read -r num total freq word; do
+ [ "$word" ] || continue
+ printf '%i %i %f %s\n' \
+ "$num" "$total" "$freq" "$DOC_ID" \
+ >>"$DIR/$word"
+done
--- /dev/null
+#!/bin/sh
+
+. "${_EXEC:-${0%/*}}/cgilite/cgilite.sh"
+. "${_EXEC:-${0%/*}}/cgilite/storage.sh"
+. "${_EXEC:-${0%/*}}/cgilite/json.sh"
+
+[ "$_DATE" ] || _DATE="$(date +%s)"
+
+_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}"
+_records="${_DATA}/${_INDEX}/_0_DOCS"
+
+# Extract indexable plain text from an uploaded document.
+# $1 - a JSON record handle as produced by json_load.
+# Reads the base64-encoded "content" field, picks a converter based on the
+# file suffix of the "title" field, and writes the extracted text to stdout.
+# Unknown suffixes produce no output.
+# NOTE(review): relies on external tools (pdftotext, catdoc, xls2csv,
+# catppt, unzip) being installed - a missing tool silently yields no text.
+ingest() {
+  local J="$1"
+
+  # json_get "$J" title
+  # json_get "$J" parts.comments
+
+  case $(json_get "$J" title) in
+    *.md|*.txt|*.csv)
+      # Plain-text formats: decoding is all that is needed
+      json_get "$J" content |base64 -d
+      ;;
+    *.pdf)
+      json_get "$J" content |base64 -d \
+      | pdftotext -
+      ;;
+    *.doc)
+      json_get "$J" content |base64 -d \
+      | catdoc /dev/stdin
+      ;;
+    *.xls)
+      json_get "$J" content |base64 -d \
+      | xls2csv /dev/stdin
+      ;;
+    *.ppt)
+      json_get "$J" content |base64 -d \
+      | catppt /dev/stdin
+      ;;
+    *.html|*.xml|*.svg)
+      # Strip markup tags; text nodes remain
+      json_get "$J" content |base64 -d \
+      | sed 's;<[^>]*>;;g'
+      ;;
+    *.docx)
+      # OOXML: the text lives in word/document.xml inside the zip container.
+      # head -c 128M caps extraction at 128 MiB (GNU size suffix).
+      # NOTE(review): tags are removed without a space here (and for .odt),
+      # but replaced by a space for .xlsx/.ods - adjacent words may be
+      # joined; confirm whether that is intentional.
+      json_get "$J" content |base64 -d \
+      | unzip -qc /dev/stdin word/document.xml \
+      | head -c 128M | sed 's;<[^>]*>;;g'
+      ;;
+    *.xlsx)
+      # OOXML spreadsheet: shared strings hold the cell text
+      json_get "$J" content |base64 -d \
+      | unzip -qc /dev/stdin xl/sharedStrings.xml \
+      | head -c 128M | sed 's;<[^>]*>; ;g'
+      ;;
+    *.odt)
+      # OpenDocument text: content.xml inside the zip container
+      json_get "$J" content |base64 -d \
+      | unzip -qc /dev/stdin content.xml \
+      | head -c 128M | sed 's;<[^>]*>;;g'
+      ;;
+    *.ods|*.odp)
+      # OpenDocument spreadsheet/presentation: tags become spaces
+      json_get "$J" content |base64 -d \
+      | unzip -qc /dev/stdin content.xml \
+      | head -c 128M | sed 's;<[^>]*>; ;g'
+      ;;
+    *):;;
+  esac
+}
+
+if [ "$REQUEST_METHOD" = "PUT" ]; then
+ _doc="${PATH_INFO#"/${_INDEX}/_doc"}"
+
+ J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
+
+ ingest "$J" \
+ | "${_EXEC}/concordance.sh" \
+ "$_DATA/$_INDEX/" "$(STRING "$_doc") $_DATE"
+
+ J="${J#obj:}"
+ J="$(DB2 "$J" delete content)"
+ J="$(DB2 "$J" set _indexdate num:"$_DATE")"
+
+ if DBM "$_records" insert "$_doc" "$J"; then
+ printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")" \
+ result="created"
+ elif DBM "$_records" update "$_doc" "$J"; then
+ printf '%s: %s\r\n' "Status" "200 OK"
+ result="updated"
+ else
+ printf '%s\r\n' "Status: 500 Internal Server Error" ""
+ exit 0
+ fi
+
+ sed 's;$;\r;' <<-EOF
+ X-elastic-product: Elasticsearch
+ content-type: application/vnd.elasticsearch+json;compatible-with=8
+
+ { "_index": $(json_dump str:"${_INDEX}"),
+ "_id": $(json_dump str:"$_doc"),
+ "result": "$result",
+ "_indexdate": $_DATE
+ }
+ EOF
+ exit 0
+
+elif [ "$REQUEST_METHOD" = "DELETE" ]; then
+ _doc="${PATH_INFO#"/${_INDEX}/_doc"}"
+
+ if DBM "$_records" get "$_doc"; then
+ if DBM "$_records" delete "$_doc"; then
+ printf '%s: %s\r\n' "Status" "200 OK"
+ result="deleted"
+ else
+ printf '%s\r\n' "Status: 500 Internal Server Error" ""
+ exit 0
+ fi
+ else
+ printf '%s: %s\r\n' "Status" "404 Not Found"
+ result="not_found"
+ fi
+
+ sed 's;$;\r;' <<-EOF
+ X-elastic-product: Elasticsearch
+ content-type: application/vnd.elasticsearch+json;compatible-with=8
+
+ { "_index": $(json_dump str:"${_INDEX}"),
+ "_id": $(json_dump str:"$_doc"),
+ "result": "$result",
+ "_indexdate": $_DATE
+ }
+ EOF
+ exit 0
+
+elif [ "$REQUEST_METHOD" = "POST" ]; then
+ :
+elif [ "$REQUEST_METHOD" = "HEAD" ]; then
+ accept="$(HEADER Accept)"
+ [ ! "${accept#*"vnd.elasticsearch+json"*}" ] \
+ && ctype="${accept}" || ctype="application/json"
+
+ sed 's;$;\r;' <<-EOF
+ HTTP/1.1 200 OK
+ X-elastic-product: Elasticsearch
+ content-type: ${ctype}
+ EOF
+ exit 0
+
+elif [ "$REQUEST_METHOD" = "GET" ]; then
+ accept="$(HEADER Accept)"
+ [ ! "${accept#*"vnd.elasticsearch+json"*}" ] \
+ && ctype="${accept}" || ctype="application/json"
+
+ sed 's;$;\r;' <<-EOF
+ HTTP/1.1 200 OK
+ X-elastic-product: Elasticsearch
+ content-type: ${ctype}
+
+ EOF
+
+ if [ "$PATH_INFO" = "/${_INDEX}/" ]; then
+ sed 's;$;\r;' <<-EOF
+ { $(json_dump str:"${_INDEX}"): {
+ "aliases":{},
+ "mappings": {
+ "properties": {
+ "content": {"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
+ "hash":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
+ "metatags":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
+ "owner":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
+ "parts":{"properties":{"comments":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
+ "provider":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
+ "share_names":{"properties":{"paul":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
+ "source":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
+ "title":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}
+ }
+ },
+ "settings": {
+ "index": {
+ "routing":{"allocation":{"include":{"_tier_preference":"data_content"}}},
+ "number_of_shards":"1",
+ "provided_name": $(json_dump str:"${_INDEX}"),
+ "creation_date": "$(stat -c %W "${_DATA}/${_INDEX}")",
+ "number_of_replicas":"1",
+ "uuid":"0000000000000000000000",
+ "version":{"created":"8500010"}
+ }
+ }
+ }
+ }
+ EOF
+ else
+ sed 's;$;\r;' <<-EOF
+ { "name" : "head",
+ "cluster_name" : "elasticsearch",
+ "version" : {
+ "number" : "8.12.1",
+ "lucene_version" : "9.9.2",
+ "minimum_wire_compatibility_version" : "7.17.0",
+ "minimum_index_compatibility_version" : "7.0.0"
+ },
+ "tagline" : "You Know, for Search"
+ }
+ EOF
+ fi
+ exit 0
+
+else
+ printf '%s\r\n' "Status: 500 Internal Server Error" ""
+ exit 0
+fi