#!/bin/sh
#
# CGI endpoint that answers a small subset of the Elasticsearch HTTP API
# (as used by the Nextcloud Elastic Search plugin) from flat files.
#
#   PUT    /<index>/_doc/<id>  ingest a document and store its record
#   DELETE /<index>/_doc/<id>  delete the stored record
#   POST   /<index>/...        search for the words of the query
#   HEAD   ...                 answer with a bare 200
#   other                      501 Not Implemented
#
# Environment read by this script: the CGI variables REQUEST_METHOD,
# PATH_INFO and CONTENT_LENGTH, plus _DATA (data directory), _EXEC
# (install directory, defaults to the directory of this script) and
# TMP (temp directory, defaults to /tmp).
#
# The helpers debug, HEADER, URL, STRING, UNSTRING, DBM, DB2, json_load,
# json_get and json_dump are not defined here; they are expected to come
# from the cgilite files sourced below.

# Request start time: seconds since the epoch and the nanosecond part.
read -r _DATE _date_n <<EOF
$(date +"%s %N")
EOF

. "${_EXEC:-${0%/*}}/cgilite/cgilite.sh"
. "${_EXEC:-${0%/*}}/cgilite/storage.sh"
. "${_EXEC:-${0%/*}}/cgilite/json.sh"

debug "$REQUEST_METHOD $REQUEST_URI $SERVER_PROTOCOL $_DATE"

# A literal tab, spelled out so that the list handling in the POST branch
# does not depend on an invisible character surviving in this file.
_TAB="$(printf '\t')"

# Write the base64-decoded upload (global $content) to stdout.
_ingest_raw() {
  printf %s "$content" | base64 -d
}

# Print the text of one member of a zipped upload with all tags removed.
#   $1 - archive member to extract
#   $2 - text that replaces every removed tag ("" or " ")
# The archive is spooled to a private temp file because unzip needs a
# seekable file; output is capped at 128M before the tags are stripped.
_ingest_zip() {
  local member="$1" gap="$2" ztmp

  ztmp="$(mktemp "${TMP:-/tmp}/zipfile_XXXXXX")" || return 1
  _ingest_raw >"$ztmp"
  unzip -qc "$ztmp" "$member" \
  | head -c 128M | sed "s;<[^>]*>;${gap};g"
  rm -f -- "$ztmp"
}

# Convert an uploaded document to plain text on stdout.
#   $1 - document metadata as accepted by json_get; its "title" selects
#        the converter by file name extension
# The payload itself is read from the global $content (base64); any
# further arguments are ignored. Unknown extensions produce no output.
ingest() {
  local J="$1"

  case "$(json_get "$J" title)" in
    *.md|*.txt|*.csv)   _ingest_raw ;;
    *.pdf)              _ingest_raw | pdftotext - - ;;
    *.doc)              _ingest_raw | catdoc /dev/stdin ;;
    *.xls)              _ingest_raw | xls2csv /dev/stdin ;;
    *.ppt)              _ingest_raw | catppt /dev/stdin ;;
    *.html|*.xml|*.svg) _ingest_raw | sed 's;<[^>]*>;;g' ;;
    *.docx)             _ingest_zip word/document.xml "" ;;
    *.xlsx)             _ingest_zip xl/sharedStrings.xml " " ;;
    *.odt)              _ingest_zip content.xml "" ;;
    *.ods|*.odp)        _ingest_zip content.xml " " ;;
    *) : ;;
  esac
}

# Look up documents that contain every search word.
#   $1   - index directory holding one file per word
#   $2.. - search words (split further on whitespace and punctuation)
# Reads the global $_records to fetch the stored record of each hit.
# Prints one line per hit, best score first:
#   <score> <STRING-encoded id> <STRING-encoded record>
# Hits whose record is missing or whose _indexdate differs from the date
# noted in the word file are skipped.
search() {
  local index="$1" words w num total freq doc date J
  shift 1
  words="$*"

  # Normalise the query: split on punctuation, lower-case, join by spaces.
  # The probe character and the Unicode ranges are generated from their
  # UTF-8 bytes via printf so they cannot be damaged by re-encoding.
  words="$(printf %s\\n "$words" | awk '
    BEGIN {
      # Field separator FS should include punctuation, including
      # Unicode Block U+2000 - U+206F
      if ( length("'"$(printf '\302\241')"'") == 1 )  # UTF-8 aware AWK
        FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
      else  # byte-wise AWK: spell the same block out as byte sequences
        FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
    }
    { for (n = 1; n <= NF; n++) printf "%s ", tolower($n); }
  ')"

  # $words is split on purpose; the glob characters * ? [ and the path
  # characters / . are all separators above, so none can reach this loop.
  for w in ${words}; do
    [ ! -f "${index}/$w" ] && continue
    # -r keeps backslashes in the stored fields intact
    # (assumes the doc field is STRING-encoded - TODO confirm).
    while read -r num total freq doc date; do
      printf '%s-%i %f\n' "${doc}" "${date}" "$freq"
    done <"${index}/$w"
  done \
  | awk '
      { cnt[$1]++; weight[$1] = weight[$1] ? weight[$1] + $2 : $2; }
    END {
      # keep only the documents matched by the largest number of words
      m = 0;
      for (d in cnt) m = ( m < cnt[d] ) ? cnt[d] : m;
      for (d in cnt) if ( cnt[d] == m ) printf "%f %s\n", weight[d], d;
    }
  ' \
  | sort -nr \
  | while read -r freq doc; do
    date="${doc##*-}"
    doc="$(UNSTRING "${doc%-*}")"
    if J="$(DBM "$_records" get "$doc")"; then
      [ "$date" -eq "$(json_get obj:"$J" _indexdate)" ] \
      && printf '%f %s %s\n' \
         "$freq" "$(STRING "$doc")" "$(STRING "$J")"
    fi
  done
}

# The first path component names the index.
_INDEX="${PATH_INFO#/}"
_INDEX="${_INDEX%%/*}"
_records="${_DATA}/${_INDEX}/_0_DOCS"

# NOTE(review): INDEX (without underscore) is not set anywhere in this
# file, so the 404 branch looks unreachable. It may be a typo for _INDEX,
# but changing it could reject the first PUT to a new index - confirm the
# intent before touching it.
if [ "${INDEX}" ] && [ ! -d "${_DATA}/${_INDEX}" ]; then
  printf '%s\r\n' "Status: 404 Not Found" ""
  exit 0
elif authlist="$(DBM "${_DATA}/auth.db" get "${_INDEX}" )"; then
  # The index has an auth entry: the Basic credential sent by the client
  # must equal one of the listed tokens. The sentinel "deny" ends the
  # loop when nothing matched.
  auth="$(HEADER Authorization)"
  auth="${auth#Basic }"
  for a in $authlist deny; do
    [ "$auth" = "$a" ] && break
  done
  if [ "$a" = "deny" ] || [ -z "$auth" ]; then
    printf '%s\r\n' "Status: 401 Unauthorized" \
      "WWW-Authenticate: Basic realm=\"Rigid Find\"" "" \
    | debug
    exit 0
  fi
  unset a auth authlist
fi

if [ "$REQUEST_METHOD" = "PUT" ]; then
  _doc="${PATH_INFO#"/${_INDEX}/_doc"}"

  J="$(head -c "${CONTENT_LENGTH:-0}")"
  # Don't use json parser to get content field
  # Content can be very large and the json parser is slow
  content="$(printf %s\\n "$J" |sed -E '
    :X; $bY; N; bX; :Y;
    s;^.*,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*";;
    s;".*$;;
    s;\\;;g;
  ')"
  # Remove the content field before handing the rest to the JSON parser.
  J="$(printf %s\\n "$J" |sed -E '
    :X; $bY; N; bX; :Y;
    s;,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*"[^"]*";;
  ')"
  J="$(json_load "${J}")"

  debug "Content: ${#content} bytes"
  debug "$(json_dump "$J")"

  if [ "${#content}" -gt 0 ]; then
    ingest "$J" "$content" \
    | "${_EXEC:-${0%/*}}/concordance.sh" \
      "$_DATA/$_INDEX/" "$(STRING "$_doc") $_DATE"
  fi

  J="${J#obj:}"
  J="$(DB2 "$J" set _indexdate num:"$_DATE")"

  if [ "${#content}" -eq 0 ]; then
    printf '%s: %s\r\n' "Status" "200 OK"
    result="updated"
  elif DBM "$_records" insert "$_doc" "$J"; then
    # result= must be its own command: joined to the printf by a line
    # continuation it was printed as a header and never assigned.
    printf '%s: %s\r\n' "Status" "201 Created" \
      "Location" "/${_INDEX}/_doc/$(URL "$_doc")"
    result="created"
  elif DBM "$_records" update "$_doc" "$J"; then
    printf '%s: %s\r\n' "Status" "200 OK"
    result="updated"
  else
    printf '%s\r\n' "Status: 500 Internal Server Error" ""
    exit 0
  fi

  # Header lines must start in column 0; the empty line ends the headers.
  cat <<EOF
X-elastic-product: Elasticsearch
content-type: application/vnd.elasticsearch+json;compatible-with=8

{ "_index": $(json_dump str:"${_INDEX}"),
  "_id": $(json_dump str:"$_doc"),
  "result": "$result",
  "_indexdate": $_DATE
}
EOF
  exit 0

elif [ "$REQUEST_METHOD" = "DELETE" ]; then
  _doc="${PATH_INFO#"/${_INDEX}/_doc"}"

  # Only the exit status of "get" is wanted here; its output is discarded
  # so the record is not written in front of the Status header.
  if DBM "$_records" get "$_doc" >/dev/null; then
    if DBM "$_records" delete "$_doc"; then
      printf '%s: %s\r\n' "Status" "200 OK"
      result="deleted"
    else
      printf '%s\r\n' "Status: 500 Internal Server Error" ""
      exit 0
    fi
  else
    printf '%s: %s\r\n' "Status" "404 Not Found"
    result="not_found"
  fi

  cat <<EOF
X-elastic-product: Elasticsearch
content-type: application/vnd.elasticsearch+json;compatible-with=8

{ "_index": $(json_dump str:"${_INDEX}"),
  "_id": $(json_dump str:"$_doc"),
  "result": "$result",
  "_indexdate": $_DATE
}
EOF
  exit 0

elif [ "$REQUEST_METHOD" = "POST" ]; then
  J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
  J="$(json_get "$J" query.bool.must.bool.should)"

  # Collect the phrase of every "should" clause into one word list.
  words="$(
    for j in $(DB2 "$J" iterate @); do
      json_get "$(UNSTRING "$j")" match_phrase_prefix.content
    done 2>/dev/null |tr \\n ' '
  )"
  debug "Search words: $words"

  # Build the hit list: "@", then one tab-terminated "obj:..." entry per
  # hit. $words is passed quoted so the shell does not glob-expand query
  # text; search does its own word splitting.
  results="@${_TAB}$(
    search "${_DATA}/${_INDEX}" "$words" \
    | while read -r score id source; do
      debug "Hit: $id $score"
      S="$(DB2 "" set _index str:"${_INDEX}")"
      S="$(DB2 "$S" set _id str:"$(UNSTRING "${id#/}")")"
      S="$(DB2 "$S" set _score num:"$score")"
      S="$(DB2 "$S" set _source obj:"$(UNSTRING "$source")")"
      printf 'obj:%s\t' "$(STRING "$S")"
    done
  )"
  # Drop the tab left behind by the last entry (or by "@" when empty).
  results="${results%"${_TAB}"}"

  # Elapsed time in nanoseconds; reported in milliseconds below.
  t="$(( $(date +%s%N) - ${_DATE}${_date_n} ))"

  cat <<EOF
Status: 200 OK
X-elastic-product: Elasticsearch
Content-Type: application/vnd.elasticsearch+json;compatible-with=8

{ "took":$((t / 1000000)),
  "timed_out":false,
  "hits": {
    "total":{"value": $(DB2 "$results" count @) ,"relation":"eq"},
    "max_score": $(json_get "arr:$results" '[0]._score' 2>/dev/null || printf 0),
    "hits": $(json_dump "arr:$results")
  }
}
EOF

elif [ "$REQUEST_METHOD" = "HEAD" ]; then
  accept="$(HEADER Accept)"
  # Echo the client's Elasticsearch media type if it asked for one,
  # otherwise (including an empty Accept header) answer as plain JSON.
  case "$accept" in
    *"vnd.elasticsearch+json"*) ctype="${accept}" ;;
    *) ctype="application/json" ;;
  esac

  cat <<EOF
Status: 200 OK
X-elastic-product: Elasticsearch
content-type: ${ctype}

EOF
  exit 0

else
  # elif [ "$REQUEST_METHOD" = "GET" ]; then
  cat <<EOF
Status: 501 Not Implemented
X-elastic-product: Elasticsearch
content-type: text/plain

Use the Nextcloud Elastic Search Plugin to use this service.
EOF
  exit 0
fi