[ "$_DATE" ] || _DATE="$(date +%s)"
-_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}"
-_records="${_DATA}/${_INDEX}/_0_DOCS"
+debug "$REQUEST_METHOD $REQUEST_URI $SERVER_PROTOCOL $_DATE"
+# ingest JSON [CONTENT]
+# Write the plain text of an uploaded document to stdout.
+#   $1  value handed to json_get; only its "title" field is read and the
+#       file name extension of the title selects the converter
+#   $2  base64 encoded document body; when omitted, the global $content is
+#       used (the variable this function read before it took a 2nd argument)
+# Titles with an extension that is not listed below produce no output.
ingest() {
-  local J="$1"
+  local J="$1" content="${2-$content}"
  case $(json_get "$J" title) in
    *.md|*.txt|*.csv)
-      json_get "$J" content |base64 -d
+      printf %s "$content" |base64 -d
      ;;
    *.pdf)
-      json_get "$J" content |base64 -d \
-      | pdftotext -
+      printf %s "$content" |base64 -d \
+      | pdftotext - -
      ;;
    *.doc)
-      json_get "$J" content |base64 -d \
+      printf %s "$content" |base64 -d \
      | catdoc /dev/stdin
      ;;
    *.xls)
-      json_get "$J" content |base64 -d \
+      printf %s "$content" |base64 -d \
      | xls2csv /dev/stdin
      ;;
    *.ppt)
-      json_get "$J" content |base64 -d \
+      printf %s "$content" |base64 -d \
      | catppt /dev/stdin
      ;;
    *.html|*.xml|*.svg)
-      json_get "$J" content |base64 -d \
+      printf %s "$content" |base64 -d \
      | sed 's;<[^>]*>;;g'
      ;;
    *.docx)
-      json_get "$J" content |base64 -d \
+      printf %s "$content" |base64 -d \
      | unzip -qc /dev/stdin word/document.xml \
      | head -c 128M | sed 's;<[^>]*>;;g'
      ;;
    *.xlsx)
-      json_get "$J" content |base64 -d \
+      printf %s "$content" |base64 -d \
      | unzip -qc /dev/stdin xl/sharedStrings.xml \
      | head -c 128M | sed 's;<[^>]*>; ;g'
      ;;
    *.odt)
-      json_get "$J" content |base64 -d \
+      printf %s "$content" |base64 -d \
      | unzip -qc /dev/stdin content.xml \
      | head -c 128M | sed 's;<[^>]*>;;g'
      ;;
    *.ods|*.odp)
-      json_get "$J" content |base64 -d \
+      printf %s "$content" |base64 -d \
      | unzip -qc /dev/stdin content.xml \
      | head -c 128M | sed 's;<[^>]*>; ;g'
      ;;
  esac
}
+# search INDEX_DIR WORD...
+# Look up the WORDs in the per-word files below INDEX_DIR and print one line
+# per matching document:
+#   SCORE STRING(doc id) STRING(record)
+# ordered by descending SCORE (the sum of the frequency column over all
+# matched words). Only documents that match the highest number of query words
+# are kept, and index entries whose date differs from the stored record's
+# _indexdate are dropped. The record lookup reads the global $_records.
+search() {
+  local index="$1" words w num total freq doc date J
+  shift 1; words="$*"
+
+  # Split the query at punctuation and lower-case every word.
+  words="$(printf %s\\n "$words" | awk '
+    BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
+      # Probe with U+00A1 (bytes C2 A1), injected through printf so the test
+      # does not depend on the encoding of this file: length 1 means the awk
+      # counts characters, length 2 means it counts bytes.
+      if ( length("'"$(printf '\302\241')"'") == 1 ) # Utf-8 aware AWK
+        FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+      else # UTF-8 Hack: same block spelled out byte-wise (E2 80 80..BF, E2 81 80..AF)
+        FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
+    }
+    { for (n = 1; n <= NF; n++) printf "%s ", tolower($n); }
+  ')"
+
+  # $words is split on purpose; glob characters and "/" are part of FS above
+  # and therefore never reach this point.
+  for w in ${words}; do
+    [ ! -f "${index}/$w" ] && continue
+
+    # read -r: document ids are stored STRING-encoded and must reach
+    # UNSTRING with their backslashes intact.
+    while read -r num total freq doc date; do
+      printf '%s-%i %f\n' "${doc}" "${date}" "$freq"
+    done <"${index}/$w"
+  done \
+  | awk '
+      { cnt[$1]++; weight[$1] = weight[$1] ? weight[$1] + $2 : $2; }
+      END { m = 0; for (d in cnt) m = ( m < cnt[d] ) ? cnt[d] : m;
+            for (d in cnt) if ( cnt[d] == m ) printf "%f %s\n", weight[d], d;
+      }
+    ' \
+  | sort -nr \
+  | while read -r freq doc; do
+      date="${doc##*-}" doc="$(UNSTRING "${doc%-*}")"
+
+      if J="$(DBM "$_records" get "$doc")"; then
+        # Skip hits from an older indexing run of the same document.
+        [ "$date" -eq "$(json_get obj:"$J" _indexdate)" ] \
+        && printf '%f %s %s\n' \
+           "$freq" "$(STRING "$doc")" "$(STRING "$J")"
+      fi
+    done
+}
+
+_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}"
+_records="${_DATA}/${_INDEX}/_0_DOCS"
+
if [ "$REQUEST_METHOD" = "PUT" ]; then
_doc="${PATH_INFO#"/${_INDEX}/_doc"}"
- J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
-
- ingest "$J" \
- | "${_EXEC}/concordance.sh" \
- "$_DATA/$_INDEX/" "$(STRING "$_doc") $_DATE"
+ J="$(head -c "${CONTENT_LENGTH:-0}")"
+ # Don't use json parser to get content field
+ # Content can be very large and the json parser is slow
+ content="$(printf %s\\n "$J" |sed -E '
+ :X; $bY; N; bX; :Y;
+ s;^.*,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*";;
+ s;".*$;;
+ s;\\;;g;
+ ')"
+ J="$(printf %s\\n "$J" |sed -E '
+ :X; $bY; N; bX; :Y;
+ s;,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*"[^"]*";;
+ ')"
+ J="$(json_load "${J}")"
+
+ debug "Content: ${#content} bytes"
+ debug "$(json_dump "$J")"
+
+ if [ "${#content}" -gt 0 ]; then
+ ingest "$J" "$content"\
+ | "${_EXEC}/concordance.sh" \
+ "$_DATA/$_INDEX/" "$(STRING "$_doc") $_DATE"
+ fi
J="${J#obj:}"
- J="$(DB2 "$J" delete content)"
J="$(DB2 "$J" set _indexdate num:"$_DATE")"
- if DBM "$_records" insert "$_doc" "$J"; then
+ if [ "${#content}" -eq 0 ]; then
+ printf '%s: %s\r\n' "Status" "200 OK"
+ result="updated"
+ elif DBM "$_records" insert "$_doc" "$J"; then
-    printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")" \
+    # No line continuation after printf: with a trailing backslash the
+    # assignment below became a printf argument, which emitted a bogus header
+    # line and left $result unset.
+    printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")"
    result="created"
elif DBM "$_records" update "$_doc" "$J"; then
exit 0
fi
-  sed 's;$;\r;' <<-EOF
-	X-elastic-product: Elasticsearch
-	content-type: application/vnd.elasticsearch+json;compatible-with=8
-
+  # A here-document does not turn "\r" into a carriage return, so the header
+  # lines and the empty line that ends them are written with printf; the
+  # here-document that follows carries only the JSON body.
+  printf '%s\r\n' \
+    'X-elastic-product: Elasticsearch' \
+    'content-type: application/vnd.elasticsearch+json;compatible-with=8' \
+    ''
+  cat <<-EOF
{ "_index": $(json_dump str:"${_INDEX}"),
"_id": $(json_dump str:"$_doc"),
"result": "$result",
result="not_found"
fi
-  sed 's;$;\r;' <<-EOF
-	X-elastic-product: Elasticsearch
-	content-type: application/vnd.elasticsearch+json;compatible-with=8
-
+  # A here-document does not turn "\r" into a carriage return, so the header
+  # lines and the empty line that ends them are written with printf; the
+  # here-document that follows carries only the JSON body.
+  printf '%s\r\n' \
+    'X-elastic-product: Elasticsearch' \
+    'content-type: application/vnd.elasticsearch+json;compatible-with=8' \
+    ''
+  cat <<-EOF
{ "_index": $(json_dump str:"${_INDEX}"),
"_id": $(json_dump str:"$_doc"),
"result": "$result",
exit 0
elif [ "$REQUEST_METHOD" = "POST" ]; then
- :
+ J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
+ J="$(json_get "$J" query.bool.must.bool.should)"
+
+ words="$(
+ for j in $(DB2 "$J" iterate @); do
+ json_get "$(UNSTRING "$j")" match_phrase_prefix.content
+ done 2>/dev/null
+ )"
+
+ results="@ $(
+ search "${_DATA}/${_INDEX}" $words \
+ | while read -r score id source; do
+ S="$(DB2 "" set _index str:"${_INDEX}")"
+ S="$(DB2 "$S" set _id str:"$(UNSTRING "${id#/}")")"
+ S="$(DB2 "$S" set _score num:"$score")"
+ S="$(DB2 "$S" set _source obj:"$(UNSTRING "$source")")"
+ printf 'obj:%s\t' "$(STRING "$S")"
+ done
+ )"
+ results="${results% }"
+
+  # A here-document does not turn "\r" into a carriage return, so the header
+  # lines and the empty line that ends them are written with printf; the
+  # here-document carries only the JSON body.
+  printf '%s\r\n' \
+    'Status: 200 OK' \
+    'X-elastic-product: Elasticsearch' \
+    'Content-Type: application/vnd.elasticsearch+json;compatible-with=8' \
+    ''
+  cat <<-EOF
+	{ "took":0,
+	"timed_out":false,
+	"_shards":{"total":1,"successful":1,"skipped":0,"failed":0},
+	"hits": {
+	"total":{"value": $(DB2 "$results" count @) ,"relation":"eq"},
+	"max_score": $(json_get "arr:$results" '[0]._score' 2>/dev/null || printf 0),
+	"hits": $(json_dump "arr:$results")
+	}
+	}
+	EOF
+
elif [ "$REQUEST_METHOD" = "HEAD" ]; then
accept="$(HEADER Accept)"
[ ! "${accept#*"vnd.elasticsearch+json"*}" ] \
&& ctype="${accept}" || ctype="application/json"
-  sed 's;$;\r;' <<-EOF
-	HTTP/1.1 200 OK
-	X-elastic-product: Elasticsearch
-	content-type: ${ctype}
-	EOF
+  # printf writes real CR LF line ends, including the empty line that closes
+  # the header block; "\r" inside a here-document would stay literal text.
+  printf '%s\r\n' \
+    'HTTP/1.1 200 OK' \
+    'X-elastic-product: Elasticsearch' \
+    "content-type: ${ctype}" \
+    ''
exit 0
[ ! "${accept#*"vnd.elasticsearch+json"*}" ] \
&& ctype="${accept}" || ctype="application/json"
-  sed 's;$;\r;' <<-EOF
-	HTTP/1.1 200 OK
-	X-elastic-product: Elasticsearch
-	content-type: ${ctype}
-
-	EOF
+  # printf writes real CR LF line ends, including the empty line that closes
+  # the header block; "\r" inside a here-document would stay literal text.
+  printf '%s\r\n' \
+    'HTTP/1.1 200 OK' \
+    'X-elastic-product: Elasticsearch' \
+    "content-type: ${ctype}" \
+    ''
if [ "$PATH_INFO" = "/${_INDEX}/" ]; then
- sed 's;$;\r;' <<-EOF
+ cat <<-EOF
{ $(json_dump str:"${_INDEX}"): {
"aliases":{},
"mappings": {
}
EOF
else
- sed 's;$;\r;' <<-EOF
+ cat <<-EOF
{ "name" : "head",
"cluster_name" : "elasticsearch",
"version" : {