[ "$_DATE" ] || _DATE="$(date +%s)"
-_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}"
-_records="${_DATA}/${_INDEX}/_0_DOCS"
-
+# ingest JSON [CONTENT] - write the plain text of an uploaded document to stdout.
+#   $1  JSON document as accepted by json_get; only its "title" is read here,
+#       to pick a converter by file name suffix.
+#   $2  base64 encoded payload; when omitted the global $content is used.
+# Titles with a suffix not listed below produce no output.
ingest() {
local J="$1"
+ local content="${2-$content}"
case $(json_get "$J" title) in
*.md|*.txt|*.csv)
- json_get "$J" content |base64 -d
+ printf %s "$content" |base64 -d
;;
*.pdf)
- json_get "$J" content |base64 -d \
- | pdftotext -
+ printf %s "$content" |base64 -d \
+ | pdftotext - -
;;
*.doc)
- json_get "$J" content |base64 -d \
+ printf %s "$content" |base64 -d \
| catdoc /dev/stdin
;;
*.xls)
- json_get "$J" content |base64 -d \
+ printf %s "$content" |base64 -d \
| xls2csv /dev/stdin
;;
*.ppt)
- json_get "$J" content |base64 -d \
+ printf %s "$content" |base64 -d \
| catppt /dev/stdin
;;
*.html|*.xml|*.svg)
- json_get "$J" content |base64 -d \
+ printf %s "$content" |base64 -d \
| sed 's;<[^>]*>;;g'
;;
*.docx)
- json_get "$J" content |base64 -d \
+ printf %s "$content" |base64 -d \
| unzip -qc /dev/stdin word/document.xml \
| head -c 128M | sed 's;<[^>]*>;;g'
;;
*.xlsx)
- json_get "$J" content |base64 -d \
+ printf %s "$content" |base64 -d \
| unzip -qc /dev/stdin xl/sharedStrings.xml \
| head -c 128M | sed 's;<[^>]*>; ;g'
;;
*.odt)
- json_get "$J" content |base64 -d \
+ printf %s "$content" |base64 -d \
| unzip -qc /dev/stdin content.xml \
| head -c 128M | sed 's;<[^>]*>;;g'
;;
*.ods|*.odp)
- json_get "$J" content |base64 -d \
+ printf %s "$content" |base64 -d \
| unzip -qc /dev/stdin content.xml \
| head -c 128M | sed 's;<[^>]*>; ;g'
;;
esac
}
+# search INDEX WORD... - rank the documents of an index against query words.
+#
+#   $1    index directory; one file per indexed word is looked up in it, and
+#         each line of such a file is read as "num total freq doc date".
+#   $2..  query text; joined, split on punctuation / whitespace / %XX
+#         sequences and lower-cased before the lookup.
+#
+# Only documents that match the highest number of query words are reported,
+# ordered by descending summed frequency. One line per hit is written:
+#   <weight> <STRING encoded doc id> <STRING encoded record>
+# A hit is dropped when the record's _indexdate differs from the date found
+# in the word file (presumably a stale index entry).
+# Reads the global $_records; relies on DBM, json_get, STRING and UNSTRING.
+search() {
+  local index="$1" words w num total freq doc date J
+  shift 1; words="$*"
+
+  words="$(printf %s\\n "$words" | awk '
+    BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
+      if ( length("¡") == 1 ) # Utf-8 aware AWK
+        FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+      else # UTF-8 Hack: match the byte sequences E2 80 80-BF and E2 81 80-AF
+        FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
+    }
+    { for (n = 1; n <= NF; n++) printf "%s ", tolower($n); }
+  ')"
+
+  # $words is split on purpose: one lookup per normalised query word.
+  for w in ${words}; do
+    [ ! -f "${index}/$w" ] && continue
+
+    # -r keeps backslashes in the stored doc id intact for UNSTRING below.
+    while read -r num total freq doc date; do
+      printf '%s-%i %f\n' "${doc}" "${date}" "$freq"
+    done <"${index}/$w"
+  done \
+  | awk '
+    { cnt[$1]++; weight[$1] = weight[$1] ? weight[$1] + $2 : $2; }
+    END { m = 0; for (d in cnt) m = ( m < cnt[d] ) ? cnt[d] : m;
+      for (d in cnt) if ( cnt[d] == m ) printf "%f %s\n", weight[d], d;
+    }
+  ' \
+  | sort -nr \
+  | while read -r freq doc; do
+    # Key is "<doc>-<date>": the date follows the last dash.
+    date="${doc##*-}" doc="$(UNSTRING "${doc%-*}")"
+
+    if J="$(DBM "$_records" get "$doc")"; then
+      [ "$date" -eq "$(json_get obj:"$J" _indexdate)" ] \
+      && printf '%f %s %s\n' \
+         "$freq" "$(STRING "$doc")" "$(STRING "$J")"
+    fi
+  done
+}
+
+_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}"
+_records="${_DATA}/${_INDEX}/_0_DOCS"
+
if [ "$REQUEST_METHOD" = "PUT" ]; then
_doc="${PATH_INFO#"/${_INDEX}/_doc"}"
- J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
+ J="$(head -c "${CONTENT_LENGTH:-0}")"
+ # Don't use json parser to get content field
+ # Content can be very large and the json parser is slow
+ content="$(printf %s\\n "$J" |sed -E '
+ :X; $bY; N; bX; :Y;
+ s;^.*,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*";;
+ s;".*$;;
+ s;\\;;g;
+ ')"
+ J="$(printf %s\\n "$J" |sed -E '
+ :X; $bY; N; bX; :Y;
+ s;,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*"[^"]*";;
+ ')"
+ J="$(json_load "${J}")"
- ingest "$J" \
+ ingest "$J" "$content"\
| "${_EXEC}/concordance.sh" \
"$_DATA/$_INDEX/" "$(STRING "$_doc") $_DATE"
J="${J#obj:}"
- J="$(DB2 "$J" delete content)"
J="$(DB2 "$J" set _indexdate num:"$_DATE")"
if DBM "$_records" insert "$_doc" "$J"; then
exit 0
elif [ "$REQUEST_METHOD" = "POST" ]; then
- :
+ J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
+ J="$(json_get "$J" query.bool.must.bool.should)"
+
+ words="$(
+ for j in $(DB2 "$J" iterate @); do
+ json_get "$(UNSTRING "$j")" match_phrase_prefix.content
+ done 2>/dev/null
+ )"
+
+ results="@ $(
+ search "${_DATA}/${_INDEX}" $words \
+ | while read -r score id source; do
+ S="$(DB2 "" set _id str:"$(UNSTRING "${id#/}")")"
+ S="$(DB2 "$S" set _score num:"$score")"
+ S="$(DB2 "$S" set _source obj:"$(UNSTRING "$source")")"
+ printf 'obj:%s\t' "$(STRING "$S")"
+ done
+ )"
+ results="${results% }"
+
+ sed 's;$;\r;' <<-EOF
+ Status: 200 OK
+ X-elastic-product: Elasticsearch
+ Content-Type: application/vnd.elasticsearch+json;compatible-with=8
+
+ { "took":0,
+ "timed_out":false,
+ "_shards":{"total":1,"successful":1,"skipped":0,"failed":0},
+ "hits": {
+ "total":{"value": $(DB2 "$results" count @) ,"relation":"eq"},
+ "max_score": $(json_get "arr:$results" '[0]._score' 2>&- || printf 0),
+ "hits": $(json_dump "arr:$results")
+ }
+ }
+ EOF
+
elif [ "$REQUEST_METHOD" = "HEAD" ]; then
accept="$(HEADER Accept)"
[ ! "${accept#*"vnd.elasticsearch+json"*}" ] \
HTTP/1.1 200 OK
X-elastic-product: Elasticsearch
content-type: ${ctype}
+
EOF
exit 0