git.plutz.net Git - rigidfind/blob - index.cgi

   1 #!/bin/sh
   2
   3 . "${_EXEC:-${0%/*}}/cgilite/cgilite.sh"
   4 . "${_EXEC:-${0%/*}}/cgilite/storage.sh"
   5 . "${_EXEC:-${0%/*}}/cgilite/json.sh"
   6
   7 [ "$_DATE" ] || _DATE="$(date +%s)"
   8
   9 debug "$REQUEST_METHOD  $REQUEST_URI    $SERVER_PROTOCOL        $_DATE"
  10
  11 ingest() {
  12   local J="$1" ztmp="${TMP:-/tmp}/zipfile_$$.zip"
  13
  14   # json_get "$J" title
  15   # json_get "$J" parts.comments
  16
  17   case $(json_get "$J" title) in
  18     *.md|*.txt|*.csv)
  19       printf %s "$content" |base64 -d
  20       ;;
  21     *.pdf)
  22       printf %s "$content" |base64 -d \
  23       | pdftotext - -
  24       ;;
  25     *.doc)
  26       printf %s "$content" |base64 -d \
  27       | catdoc /dev/stdin
  28       ;;
  29     *.xls)
  30       printf %s "$content" |base64 -d \
  31       | xls2csv /dev/stdin
  32       ;;
  33     *.ppt)
  34       printf %s "$content" |base64 -d \
  35       | catppt /dev/stdin
  36       ;;
  37     *.html|*.xml|*.svg)
  38       printf %s "$content" |base64 -d \
  39       | sed 's;<[^>]*>;;g'
  40       ;;
  41     *.docx)
  42       printf %s "$content" |base64 -d >"$ztmp"
  43       unzip -qc "$ztmp" word/document.xml \
  44       | head -c 128M | sed 's;<[^>]*>;;g'
  45       rm -- "$ztmp"
  46       ;;
  47     *.xlsx)
  48       printf %s "$content" |base64 -d >"$ztmp"
  49       unzip -qc "$ztmp" xl/sharedStrings.xml \
  50       | head -c 128M | sed 's;<[^>]*>; ;g'
  51       rm -- "$ztmp"
  52       ;;
  53     *.odt)
  54       printf %s "$content" |base64 -d >"$ztmp"
  55       unzip -qc "$ztmp" content.xml \
  56       | head -c 128M | sed 's;<[^>]*>;;g'
  57       rm -- "$ztmp"
  58       ;;
  59     *.ods|*.odp)
  60       printf %s "$content" |base64 -d >"$ztmp"
  61       unzip -qc "$ztmp" content.xml \
  62       | head -c 128M | sed 's;<[^>]*>; ;g'
  63       rm -- "$ztmp"
  64       ;;
  65     *):;;
  66   esac
  67 }
  68
  69 search() {
  70   local index="$1" words w num total freq doc date J
  71   shift 1; words="$@"
  72
  73   words="$(printf %s\\n "$words" | awk '
  74     BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
  75             if ( length("¡") == 1 )  # Utf-8 aware AWK
  76             FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
  77             else                     # UTF-8 Hack
  78             FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\201-\257]')"')+";
  79             fi
  80           }
  81       { for (n = 1; n <= NF; n++) printf "%s  ", tolower($n); }
  82   ')"
  83
  84   for w in ${words}; do
  85     [ ! -f "${index}/$w" ] && continue
  86
  87     while read num total freq doc date; do
  88       printf '%s-%i  %f\n' "${doc}" "${date}" "$freq"
  89     done <"${index}/$w"
  90   done \
  91   | awk '
  92         { cnt[$1]++; weight[$1] = weight[$1] ? weight[$1] + $2 : $2; }
  93     END { m = 0; for (d in cnt) m = ( m < cnt[d] ) ? cnt[d] : m;
  94           for (d in cnt) if ( cnt[d] == m ) printf "%f    %s\n", weight[d], d;
  95         }
  96   ' \
  97   | sort -nr \
  98   | while read freq doc; do
  99     date="${doc##*-}" doc="$(UNSTRING "${doc%-*}")"
 100
 101     if J="$(DBM "$_records" get "$doc")"; then
 102       [ "$date" -eq "$(json_get obj:"$J" _indexdate)" ] \
 103       && printf '%f     %s      %s\n' \
 104          "$freq" "$(STRING "$doc")" "$(STRING "$J")"
 105     fi
 106   done
 107 }
 108
 109 _INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}"
 110 _records="${_DATA}/${_INDEX}/_0_DOCS"
 111
 112 if [ "${INDEX}" -a ! -d "${_DATA}/${_INDEX}" ]; then
 113   printf '%s\r\n' "Status: 404 Not Found" ""
 114   exit 0
 115 elif authlist="$(DBM "${_DATA}/auth.db" get "${_INDEX}" )"; then
 116   auth="$(HEADER Authorization)" auth="${auth#Basic }"
 117   for a in $authlist deny; do
 118     [ "$auth" = "$a" ] && break
 119   done
 120   if [ "$a" = "deny" -o ! "$auth" ]; then
 121     printf '%s\r\n' "Status: 401 Unauthorized" \
 122       "WWW-Authenticate: Basic realm=\"Rigid Find\"" "" \
 123     | debug
 124     exit 0
 125   fi
 126   unset a auth authlist
 127 fi
 128
 129 if   [ "$REQUEST_METHOD" = "PUT" ]; then
 130   _doc="${PATH_INFO#"/${_INDEX}/_doc"}"
 131
 132   J="$(head -c "${CONTENT_LENGTH:-0}")"
 133   # Don't use json parser to get content field
 134   # Content can be very large and the json parser is slow
 135   content="$(printf %s\\n "$J" |sed -E '
 136     :X; $bY; N; bX; :Y;
 137     s;^.*,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*";;
 138     s;".*$;;
 139     s;\\;;g;
 140   ')"
 141   J="$(printf %s\\n "$J" |sed -E '
 142     :X; $bY; N; bX; :Y;
 143     s;,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*"[^"]*";;
 144   ')"
 145   J="$(json_load "${J}")"
 146
 147   debug "Content: ${#content} bytes"
 148   debug "$(json_dump "$J")"
 149
 150   if [ "${#content}" -gt 0 ]; then
 151     ingest "$J" "$content"\
 152     | "${_EXEC}/concordance.sh" \
 153       "$_DATA/$_INDEX/" "$(STRING "$_doc")      $_DATE"
 154   fi
 155
 156   J="${J#obj:}"
 157   J="$(DB2 "$J" set _indexdate num:"$_DATE")"
 158
 159   if [ "${#content}" -eq 0 ]; then
 160     printf '%s: %s\r\n' "Status" "200 OK"
 161     result="updated"
 162   elif DBM "$_records" insert "$_doc" "$J"; then
 163     printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")" \
 164     result="created"
 165   elif DBM "$_records" update "$_doc" "$J"; then
 166     printf '%s: %s\r\n' "Status" "200 OK"
 167     result="updated"
 168   else
 169     printf '%s\r\n' "Status: 500 Internal Server Error" ""
 170     exit 0
 171   fi
 172
 173   cat <<-EOF
 174         X-elastic-product: Elasticsearch\r
 175         content-type: application/vnd.elasticsearch+json;compatible-with=8\r
 176         \r
 177         { "_index": $(json_dump str:"${_INDEX}"),
 178           "_id": $(json_dump str:"$_doc"),
 179           "result": "$result",
 180           "_indexdate": $_DATE
 181         }
 182         EOF
 183   exit 0
 184
 185 elif [ "$REQUEST_METHOD" = "DELETE" ]; then
 186   _doc="${PATH_INFO#"/${_INDEX}/_doc"}"
 187
 188   if   DBM "$_records" get "$_doc"; then
 189     if   DBM "$_records" delete "$_doc"; then
 190       printf '%s: %s\r\n' "Status" "200 OK"
 191       result="deleted"
 192     else
 193       printf '%s\r\n' "Status: 500 Internal Server Error" ""
 194       exit 0
 195     fi
 196   else
 197     printf '%s: %s\r\n' "Status" "404 Not Found"
 198     result="not_found"
 199   fi
 200
 201   cat <<-EOF
 202         X-elastic-product: Elasticsearch\r
 203         content-type: application/vnd.elasticsearch+json;compatible-with=8\r
 204         \r
 205         { "_index": $(json_dump str:"${_INDEX}"),
 206           "_id": $(json_dump str:"$_doc"),
 207           "result": "$result",
 208           "_indexdate": $_DATE
 209         }
 210         EOF
 211   exit 0
 212
 213 elif [ "$REQUEST_METHOD" = "POST" ]; then
 214   J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
 215   J="$(json_get "$J" query.bool.must.bool.should)"
 216
 217   words="$(
 218     for j in $(DB2 "$J" iterate @); do
 219       json_get "$(UNSTRING "$j")" match_phrase_prefix.content
 220     done 2>/dev/null
 221   )"
 222   debug "Search words: $words"
 223
 224   results="@    $(
 225     search "${_DATA}/${_INDEX}" $words \
 226     | while read -r score id source; do
 227       debug "Hit: $id   $score"
 228       S="$(DB2   "" set _index  str:"${_INDEX}")"
 229       S="$(DB2 "$S" set _id     str:"$(UNSTRING "${id#/}")")"
 230       S="$(DB2 "$S" set _score  num:"$score")"
 231       S="$(DB2 "$S" set _source obj:"$(UNSTRING "$source")")"
 232       printf 'obj:%s\t' "$(STRING "$S")"
 233     done
 234   )"
 235   results="${results%   }"
 236
 237   cat <<-EOF
 238         Status: 200 OK\r
 239         X-elastic-product: Elasticsearch\r
 240         Content-Type: application/vnd.elasticsearch+json;compatible-with=8\r
 241         \r
 242         { "took":0,
 243           "timed_out":false,
 244           "_shards":{"total":1,"successful":1,"skipped":0,"failed":0},
 245           "hits": {
 246             "total":{"value": $(DB2 "$results" count @) ,"relation":"eq"},
 247             "max_score": $(json_get "arr:$results" '[0]._score' 2>&- || printf 0),
 248             "hits": $(json_dump "arr:$results")
 249           }
 250         }
 251         EOF
 252
 253 elif [ "$REQUEST_METHOD" = "HEAD" ]; then
 254   accept="$(HEADER Accept)"
 255   [ ! "${accept#*"vnd.elasticsearch+json"*}" ] \
 256   && ctype="${accept}" || ctype="application/json"
 257
 258   cat <<-EOF
 259         Status: 200 OK\r
 260         X-elastic-product: Elasticsearch\r
 261         content-type: ${ctype}\r
 262         \r
 263         EOF
 264   exit 0
 265
 266 else
 267   # elif [ "$REQUEST_METHOD" = "GET" ]; then
 268   cat <<-EOF
 269         Status: 501 Not Implemented\r
 270         X-elastic-product: Elasticsearch\r
 271         content-type: text/plain\r
 272         \r
 273         Use the Nextcloud Elastic Search Plugin to use this service.
 274         EOF
 275   exit 0
 276 fi