X-Git-Url: https://git.plutz.net//?a=blobdiff_plain;f=index.cgi;h=0078a2f8f1837201ae7fc7a19324cd0fe01d4818;hb=092bfad64ae6b2022991f6193f042d6d53bf10c2;hp=cb325ea4a8e78b366d6e07e3ccade0e71f9191e4;hpb=b812c21251a6dca0f419869c773d86cfc1bd3942;p=rigidfind

diff --git a/index.cgi b/index.cgi
index cb325ea..0078a2f 100755
--- a/index.cgi
+++ b/index.cgi
@@ -6,77 +6,143 @@
 
 [ "$_DATE" ] || _DATE="$(date +%s)"
 
-_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}"
-_records="${_DATA}/${_INDEX}/_0_DOCS"
+debug "$REQUEST_METHOD	$REQUEST_URI	$SERVER_PROTOCOL	$_DATE"
 
 ingest() {
-  local J="$1"
+  local J="$1" ztmp="${TMP:-/tmp}/zipfile_$$.zip"
 
   # json_get "$J" title
   # json_get "$J" parts.comments
 
   case $(json_get "$J" title) in
     *.md|*.txt|*.csv)
-      json_get "$J" content |base64 -d
+      printf %s "$content" |base64 -d
       ;;
     *.pdf)
-      json_get "$J" content |base64 -d \
-      | pdftotext -
+      printf %s "$content" |base64 -d \
+      | pdftotext - -
       ;;
     *.doc)
-      json_get "$J" content |base64 -d \
+      printf %s "$content" |base64 -d \
       | catdoc /dev/stdin
       ;;
     *.xls)
-      json_get "$J" content |base64 -d \
+      printf %s "$content" |base64 -d \
       | xls2csv /dev/stdin
       ;;
     *.ppt)
-      json_get "$J" content |base64 -d \
+      printf %s "$content" |base64 -d \
       | catppt /dev/stdin
       ;;
     *.html|*.xml|*.svg)
-      json_get "$J" content |base64 -d \
+      printf %s "$content" |base64 -d \
       | sed 's;<[^>]*>;;g'
       ;;
     *.docx)
-      json_get "$J" content |base64 -d \
-      | unzip -qc /dev/stdin word/document.xml \
+      printf %s "$content" |base64 -d >"$ztmp"
+      unzip -qc "$ztmp" word/document.xml \
       | head -c 128M | sed 's;<[^>]*>;;g'
+      rm -- "$ztmp"
       ;;
     *.xlsx)
-      json_get "$J" content |base64 -d \
-      | unzip -qc /dev/stdin xl/sharedStrings.xml \
+      printf %s "$content" |base64 -d >"$ztmp"
+      unzip -qc "$ztmp" xl/sharedStrings.xml \
       | head -c 128M | sed 's;<[^>]*>; ;g'
+      rm -- "$ztmp"
       ;;
     *.odt)
-      json_get "$J" content |base64 -d \
-      | unzip -qc /dev/stdin content.xml \
+      printf %s "$content" |base64 -d >"$ztmp"
+      unzip -qc "$ztmp" content.xml \
       | head -c 128M | sed 's;<[^>]*>;;g'
+      rm -- "$ztmp"
       ;;
     *.ods|*.odp)
-      json_get "$J" content |base64 -d \
-      | unzip -qc /dev/stdin content.xml \
+      printf %s "$content" |base64 -d >"$ztmp"
+      unzip -qc "$ztmp" content.xml \
       | head -c 128M | sed 's;<[^>]*>; ;g'
+      rm -- "$ztmp"
       ;;
     *):;;
   esac
 }
 
+search() {
+  local index="$1" words w num total freq doc date J
+  shift 1; words="$@"
+
+  words="$(printf %s\\n "$words" | awk '
+    BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
+            if ( length("¡") == 1 )  # Utf-8 aware AWK
+            FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+            else                     # UTF-8 Hack
+            FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\201-\257]')"')+";
+            fi
+          }
+      { for (n = 1; n <= NF; n++) printf "%s  ", tolower($n); }
+  ')"
+
+  for w in ${words}; do
+    [ ! -f "${index}/$w" ] && continue
+  
+    while read num total freq doc date; do
+      printf '%s-%i  %f\n' "${doc}" "${date}" "$freq"
+    done <"${index}/$w"
+  done \
+  | awk '
+        { cnt[$1]++; weight[$1] = weight[$1] ? weight[$1] + $2 : $2; }
+    END { m = 0; for (d in cnt) m = ( m < cnt[d] ) ? cnt[d] : m;
+          for (d in cnt) if ( cnt[d] == m ) printf "%f    %s\n", weight[d], d;
+        }
+  ' \
+  | sort -nr \
+  | while read freq doc; do
+    date="${doc##*-}" doc="$(UNSTRING "${doc%-*}")"
+
+    if J="$(DBM "$_records" get "$doc")"; then
+      [ "$date" -eq "$(json_get obj:"$J" _indexdate)" ] \
+      && printf '%f	%s	%s\n' \
+         "$freq" "$(STRING "$doc")" "$(STRING "$J")"
+    fi
+  done
+}
+
+_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}"
+_records="${_DATA}/${_INDEX}/_0_DOCS"
+
 if   [ "$REQUEST_METHOD" = "PUT" ]; then
   _doc="${PATH_INFO#"/${_INDEX}/_doc"}"
 
-  J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
-  
-  ingest "$J" \
-  | "${_EXEC}/concordance.sh" \
-    "$_DATA/$_INDEX/" "$(STRING "$_doc")	$_DATE"
+  J="$(head -c "${CONTENT_LENGTH:-0}")"
+  # Don't use json parser to get content field
+  # Content can be very large and the json parser is slow
+  content="$(printf %s\\n "$J" |sed -E '
+    :X; $bY; N; bX; :Y;
+    s;^.*,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*";;
+    s;".*$;;
+    s;\\;;g;
+  ')"
+  J="$(printf %s\\n "$J" |sed -E '
+    :X; $bY; N; bX; :Y;
+    s;,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*"[^"]*";;
+  ')"
+  J="$(json_load "${J}")"
+
+  debug "Content: ${#content} bytes"
+  debug "$(json_dump "$J")"
+
+  if [ "${#content}" -gt 0 ]; then
+    ingest "$J" "$content"\
+    | "${_EXEC}/concordance.sh" \
+      "$_DATA/$_INDEX/" "$(STRING "$_doc")	$_DATE"
+  fi
 
   J="${J#obj:}"
-  J="$(DB2 "$J" delete content)"
   J="$(DB2 "$J" set _indexdate num:"$_DATE")"
 
-  if   DBM "$_records" insert "$_doc" "$J"; then
+  if [ "${#content}" -eq 0 ]; then
+    printf '%s: %s\r\n' "Status" "200 OK"
+    result="updated"
+  elif DBM "$_records" insert "$_doc" "$J"; then
     printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")" \
     result="created"
   elif DBM "$_records" update "$_doc" "$J"; then
@@ -87,10 +153,10 @@ if   [ "$REQUEST_METHOD" = "PUT" ]; then
     exit 0
   fi
 
-  sed 's;$;\r;' <<-EOF
-	X-elastic-product: Elasticsearch
-	content-type: application/vnd.elasticsearch+json;compatible-with=8
-
+  cat <<-EOF
+	X-elastic-product: Elasticsearch
+	content-type: application/vnd.elasticsearch+json;compatible-with=8
+	
 	{ "_index": $(json_dump str:"${_INDEX}"),
 	  "_id": $(json_dump str:"$_doc"),
 	  "result": "$result",
@@ -115,10 +181,10 @@ elif [ "$REQUEST_METHOD" = "DELETE" ]; then
     result="not_found"
   fi
 
-  sed 's;$;\r;' <<-EOF
-	X-elastic-product: Elasticsearch
-	content-type: application/vnd.elasticsearch+json;compatible-with=8
-
+  cat <<-EOF
+	X-elastic-product: Elasticsearch
+	content-type: application/vnd.elasticsearch+json;compatible-with=8
+	
 	{ "_index": $(json_dump str:"${_INDEX}"),
 	  "_id": $(json_dump str:"$_doc"),
 	  "result": "$result",
@@ -128,16 +194,53 @@ elif [ "$REQUEST_METHOD" = "DELETE" ]; then
   exit 0
 
 elif [ "$REQUEST_METHOD" = "POST" ]; then
-  :
+  J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
+  J="$(json_get "$J" query.bool.must.bool.should)"
+
+  words="$(
+    for j in $(DB2 "$J" iterate @); do
+      json_get "$(UNSTRING "$j")" match_phrase_prefix.content
+    done 2>/dev/null
+  )"
+
+  results="@	$(
+    search "${_DATA}/${_INDEX}" $words \
+    | while read -r score id source; do
+      S="$(DB2   "" set _index  str:"${_INDEX}")"
+      S="$(DB2 "$S" set _id     str:"$(UNSTRING "${id#/}")")"
+      S="$(DB2 "$S" set _score  num:"$score")"
+      S="$(DB2 "$S" set _source obj:"$(UNSTRING "$source")")"
+      printf 'obj:%s\t' "$(STRING "$S")"
+    done
+  )"
+  results="${results%	}"
+
+  cat <<-EOF
+	Status: 200 OK
+	X-elastic-product: Elasticsearch
+	Content-Type: application/vnd.elasticsearch+json;compatible-with=8
+	
+	{ "took":0,
+	  "timed_out":false,
+	  "_shards":{"total":1,"successful":1,"skipped":0,"failed":0},
+	  "hits": {
+	    "total":{"value": $(DB2 "$results" count @) ,"relation":"eq"},
+	    "max_score": $(json_get "arr:$results" '[0]._score' 2>&- || printf 0),
+	    "hits": $(json_dump "arr:$results")
+	  }
+	}
+	EOF
+
 elif [ "$REQUEST_METHOD" = "HEAD" ]; then
   accept="$(HEADER Accept)"
   [ ! "${accept#*"vnd.elasticsearch+json"*}" ] \
   && ctype="${accept}" || ctype="application/json"
 
-  sed 's;$;\r;' <<-EOF
-	HTTP/1.1 200 OK
-	X-elastic-product: Elasticsearch
-	content-type: ${ctype}
+  cat <<-EOF
+	HTTP/1.1 200 OK
+	X-elastic-product: Elasticsearch
+	content-type: ${ctype}
+	
 	EOF
   exit 0
 
@@ -146,15 +249,15 @@ elif [ "$REQUEST_METHOD" = "GET" ]; then
   [ ! "${accept#*"vnd.elasticsearch+json"*}" ] \
   && ctype="${accept}" || ctype="application/json"
 
-  sed 's;$;\r;' <<-EOF
-	HTTP/1.1 200 OK
-	X-elastic-product: Elasticsearch
-	content-type: ${ctype}
-
+  cat <<-EOF
+	HTTP/1.1 200 OK
+	X-elastic-product: Elasticsearch
+	content-type: ${ctype}
+	
 	EOF
 	
   if [ "$PATH_INFO" = "/${_INDEX}/" ]; then
-  sed 's;$;\r;' <<-EOF
+  cat <<-EOF
 	{ $(json_dump str:"${_INDEX}"): {
 	    "aliases":{},
 	    "mappings": {
@@ -185,7 +288,7 @@ elif [ "$REQUEST_METHOD" = "GET" ]; then
 	}
 	EOF
   else
-    sed 's;$;\r;' <<-EOF
+    cat <<-EOF
 	{ "name" : "head",
 	  "cluster_name" : "elasticsearch",
 	  "version" : {