X-Git-Url: https://git.plutz.net//?a=blobdiff_plain;f=index.cgi;h=0078a2f8f1837201ae7fc7a19324cd0fe01d4818;hb=092bfad64ae6b2022991f6193f042d6d53bf10c2;hp=cb325ea4a8e78b366d6e07e3ccade0e71f9191e4;hpb=b812c21251a6dca0f419869c773d86cfc1bd3942;p=rigidfind diff --git a/index.cgi b/index.cgi index cb325ea..0078a2f 100755 --- a/index.cgi +++ b/index.cgi @@ -6,77 +6,143 @@ [ "$_DATE" ] || _DATE="$(date +%s)" -_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}" -_records="${_DATA}/${_INDEX}/_0_DOCS" +debug "$REQUEST_METHOD $REQUEST_URI $SERVER_PROTOCOL $_DATE" ingest() { - local J="$1" + local J="$1" ztmp="${TMP:-/tmp}/zipfile_$$.zip" # json_get "$J" title # json_get "$J" parts.comments case $(json_get "$J" title) in *.md|*.txt|*.csv) - json_get "$J" content |base64 -d + printf %s "$content" |base64 -d ;; *.pdf) - json_get "$J" content |base64 -d \ - | pdftotext - + printf %s "$content" |base64 -d \ + | pdftotext - - ;; *.doc) - json_get "$J" content |base64 -d \ + printf %s "$content" |base64 -d \ | catdoc /dev/stdin ;; *.xls) - json_get "$J" content |base64 -d \ + printf %s "$content" |base64 -d \ | xls2csv /dev/stdin ;; *.ppt) - json_get "$J" content |base64 -d \ + printf %s "$content" |base64 -d \ | catppt /dev/stdin ;; *.html|*.xml|*.svg) - json_get "$J" content |base64 -d \ + printf %s "$content" |base64 -d \ | sed 's;<[^>]*>;;g' ;; *.docx) - json_get "$J" content |base64 -d \ - | unzip -qc /dev/stdin word/document.xml \ + printf %s "$content" |base64 -d >"$ztmp" + unzip -qc "$ztmp" word/document.xml \ | head -c 128M | sed 's;<[^>]*>;;g' + rm -- "$ztmp" ;; *.xlsx) - json_get "$J" content |base64 -d \ - | unzip -qc /dev/stdin xl/sharedStrings.xml \ + printf %s "$content" |base64 -d >"$ztmp" + unzip -qc "$ztmp" xl/sharedStrings.xml \ | head -c 128M | sed 's;<[^>]*>; ;g' + rm -- "$ztmp" ;; *.odt) - json_get "$J" content |base64 -d \ - | unzip -qc /dev/stdin content.xml \ + printf %s "$content" |base64 -d >"$ztmp" + unzip -qc "$ztmp" content.xml \ | head -c 128M | sed 's;<[^>]*>;;g' + rm -- "$ztmp" ;; *.ods|*.odp) - json_get "$J" content |base64 -d \ - | unzip -qc /dev/stdin content.xml \ + printf %s "$content" |base64 -d >"$ztmp" + unzip -qc "$ztmp" content.xml \ | head -c 128M | sed 's;<[^>]*>; ;g' + rm -- "$ztmp" ;; *):;; esac } +search() { + local index="$1" words w num total freq doc date J + shift 1; words="$@" + + words="$(printf %s\\n "$words" | awk ' + BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F + if ( length("¡") == 1 ) # Utf-8 aware AWK + FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+"; + else # UTF-8 Hack + FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\201-\257]')"')+"; + fi + } + { for (n = 1; n <= NF; n++) printf "%s ", tolower($n); } + ')" + + for w in ${words}; do + [ ! -f "${index}/$w" ] && continue + + while read num total freq doc date; do + printf '%s-%i %f\n' "${doc}" "${date}" "$freq" + done <"${index}/$w" + done \ + | awk ' + { cnt[$1]++; weight[$1] = weight[$1] ? weight[$1] + $2 : $2; } + END { m = 0; for (d in cnt) m = ( m < cnt[d] ) ? cnt[d] : m; + for (d in cnt) if ( cnt[d] == m ) printf "%f %s\n", weight[d], d; + } + ' \ + | sort -nr \ + | while read freq doc; do + date="${doc##*-}" doc="$(UNSTRING "${doc%-*}")" + + if J="$(DBM "$_records" get "$doc")"; then + [ "$date" -eq "$(json_get obj:"$J" _indexdate)" ] \ + && printf '%f %s %s\n' \ + "$freq" "$(STRING "$doc")" "$(STRING "$J")" + fi + done +} + +_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}" +_records="${_DATA}/${_INDEX}/_0_DOCS" + if [ "$REQUEST_METHOD" = "PUT" ]; then _doc="${PATH_INFO#"/${_INDEX}/_doc"}" - J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")" - - ingest "$J" \ - | "${_EXEC}/concordance.sh" \ - "$_DATA/$_INDEX/" "$(STRING "$_doc") $_DATE" + J="$(head -c "${CONTENT_LENGTH:-0}")" + # Don't use json parser to get content field + # Content can be very large and the json parser is slow + content="$(printf %s\\n "$J" |sed -E ' + :X; $bY; N; bX; :Y; + s;^.*,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*";; + s;".*$;; + s;\\;;g; + ')" + J="$(printf %s\\n "$J" |sed -E ' + :X; $bY; N; bX; :Y; + s;,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*"[^"]*";; + ')" + J="$(json_load "${J}")" + + debug "Content: ${#content} bytes" + debug "$(json_dump "$J")" + + if [ "${#content}" -gt 0 ]; then + ingest "$J" "$content"\ + | "${_EXEC}/concordance.sh" \ + "$_DATA/$_INDEX/" "$(STRING "$_doc") $_DATE" + fi J="${J#obj:}" - J="$(DB2 "$J" delete content)" J="$(DB2 "$J" set _indexdate num:"$_DATE")" - if DBM "$_records" insert "$_doc" "$J"; then + if [ "${#content}" -eq 0 ]; then + printf '%s: %s\r\n' "Status" "200 OK" + result="updated" + elif DBM "$_records" insert "$_doc" "$J"; then printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")" \ result="created" elif DBM "$_records" update "$_doc" "$J"; then @@ -87,10 +153,10 @@ if [ "$REQUEST_METHOD" = "PUT" ]; then exit 0 fi - sed 's;$;\r;' <<-EOF - X-elastic-product: Elasticsearch - content-type: application/vnd.elasticsearch+json;compatible-with=8 - + cat <<-EOF + X-elastic-product: Elasticsearch + content-type: application/vnd.elasticsearch+json;compatible-with=8 + { "_index": $(json_dump str:"${_INDEX}"), "_id": $(json_dump str:"$_doc"), "result": "$result", @@ -115,10 +181,10 @@ elif [ "$REQUEST_METHOD" = "DELETE" ]; then result="not_found" fi - sed 's;$;\r;' <<-EOF - X-elastic-product: Elasticsearch - content-type: application/vnd.elasticsearch+json;compatible-with=8 - + cat <<-EOF + X-elastic-product: Elasticsearch + content-type: application/vnd.elasticsearch+json;compatible-with=8 + { "_index": $(json_dump str:"${_INDEX}"), "_id": $(json_dump str:"$_doc"), "result": "$result", @@ -128,16 +194,53 @@ elif [ "$REQUEST_METHOD" = "DELETE" ]; then exit 0 elif [ "$REQUEST_METHOD" = "POST" ]; then - : + J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")" + J="$(json_get "$J" query.bool.must.bool.should)" + + words="$( + for j in $(DB2 "$J" iterate @); do + json_get "$(UNSTRING "$j")" match_phrase_prefix.content + done 2>/dev/null + )" + + results="@ $( + search "${_DATA}/${_INDEX}" $words \ + | while read -r score id source; do + S="$(DB2 "" set _index str:"${_INDEX}")" + S="$(DB2 "$S" set _id str:"$(UNSTRING "${id#/}")")" + S="$(DB2 "$S" set _score num:"$score")" + S="$(DB2 "$S" set _source obj:"$(UNSTRING "$source")")" + printf 'obj:%s\t' "$(STRING "$S")" + done + )" + results="${results% }" + + cat <<-EOF + Status: 200 OK + X-elastic-product: Elasticsearch + Content-Type: application/vnd.elasticsearch+json;compatible-with=8 + + { "took":0, + "timed_out":false, + "_shards":{"total":1,"successful":1,"skipped":0,"failed":0}, + "hits": { + "total":{"value": $(DB2 "$results" count @) ,"relation":"eq"}, + "max_score": $(json_get "arr:$results" '[0]._score' 2>&- || printf 0), + "hits": $(json_dump "arr:$results") + } + } + EOF + elif [ "$REQUEST_METHOD" = "HEAD" ]; then accept="$(HEADER Accept)" [ ! "${accept#*"vnd.elasticsearch+json"*}" ] \ && ctype="${accept}" || ctype="application/json" - sed 's;$;\r;' <<-EOF - HTTP/1.1 200 OK - X-elastic-product: Elasticsearch - content-type: ${ctype} + cat <<-EOF + HTTP/1.1 200 OK + X-elastic-product: Elasticsearch + content-type: ${ctype} + EOF exit 0 @@ -146,15 +249,15 @@ elif [ "$REQUEST_METHOD" = "GET" ]; then [ ! "${accept#*"vnd.elasticsearch+json"*}" ] \ && ctype="${accept}" || ctype="application/json" - sed 's;$;\r;' <<-EOF - HTTP/1.1 200 OK - X-elastic-product: Elasticsearch - content-type: ${ctype} - + cat <<-EOF + HTTP/1.1 200 OK + X-elastic-product: Elasticsearch + content-type: ${ctype} + EOF if [ "$PATH_INFO" = "/${_INDEX}/" ]; then - sed 's;$;\r;' <<-EOF + cat <<-EOF { $(json_dump str:"${_INDEX}"): { "aliases":{}, "mappings": { @@ -185,7 +288,7 @@ elif [ "$REQUEST_METHOD" = "GET" ]; then } EOF else - sed 's;$;\r;' <<-EOF + cat <<-EOF { "name" : "head", "cluster_name" : "elasticsearch", "version" : {