# Snapshot the request start time: whole seconds (_DATE) and the sub-second
# remainder (_date_n), read from a here-document (its body and EOF delimiter
# are not visible in this fragment — presumably `date +'%s %N'` output).
3 read _DATE _date_n <<-EOF
# Pull in the cgilite helper library from the script's own directory
# (overridable via $_EXEC): CGI/HTTP helpers, key-value storage (DBM/DB2,
# STRING/UNSTRING), and the JSON parser (json_load/json_get/json_dump).
7 . "${_EXEC:-${0%/*}}/cgilite/cgilite.sh"
8 . "${_EXEC:-${0%/*}}/cgilite/storage.sh"
9 . "${_EXEC:-${0%/*}}/cgilite/json.sh"
# Trace every incoming request with its timestamp.
11 debug "$REQUEST_METHOD $REQUEST_URI $SERVER_PROTOCOL $_DATE"
14 local J="$1" ztmp="${TMP:-/tmp}/zipfile_$$.zip"
17 # json_get "$J" parts.comments
19 case $(json_get "$J" title) in
21 printf %s "$content" |base64 -d
24 printf %s "$content" |base64 -d \
28 printf %s "$content" |base64 -d \
32 printf %s "$content" |base64 -d \
36 printf %s "$content" |base64 -d \
40 printf %s "$content" |base64 -d \
44 printf %s "$content" |base64 -d >"$ztmp"
45 unzip -qc "$ztmp" word/document.xml \
46 | head -c 128M | sed 's;<[^>]*>;;g'
50 printf %s "$content" |base64 -d >"$ztmp"
51 unzip -qc "$ztmp" xl/sharedStrings.xml \
52 | head -c 128M | sed 's;<[^>]*>; ;g'
56 printf %s "$content" |base64 -d >"$ztmp"
57 unzip -qc "$ztmp" content.xml \
58 | head -c 128M | sed 's;<[^>]*>;;g'
62 printf %s "$content" |base64 -d >"$ztmp"
63 unzip -qc "$ztmp" content.xml \
64 | head -c 128M | sed 's;<[^>]*>; ;g'
72 local index="$1" words w num total freq doc date J
# Tokenize the query: lower-case the words, splitting on ASCII punctuation,
# percent-escapes (%XX) and Unicode general punctuation (U+2000-U+206F).
# NOTE(review): the literal "ยก" in the UTF-8 probe below looks mis-encoded;
# the original was presumably a single two-byte UTF-8 character (e.g. "¡")
# whose length is 1 only in a UTF-8-aware awk. As shown it is two characters,
# so the probe could never succeed — verify against the upstream source.
75 words="$(printf %s\\n "$words" | awk '
76 BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
77 if ( length("ยก") == 1 ) # Utf-8 aware AWK
78 FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
80 FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\201-\257]')"')+";
83 { for (n = 1; n <= NF; n++) printf "%s ", tolower($n); }
87 [ ! -f "${index}/$w" ] && continue
# Each per-word index row: word-count, total, weight, doc key, index date.
# Emit "dockey-date weight" pairs for the scoring pass below.
89 while read num total freq doc date; do
90 printf '%s-%i %f\n' "${doc}" "${date}" "$freq"
94 { cnt[$1]++; weight[$1] = weight[$1] ? weight[$1] + $2 : $2; }
95 END { m = 0; for (d in cnt) m = ( m < cnt[d] ) ? cnt[d] : m;
96 for (d in cnt) if ( cnt[d] == m ) printf "%f %s\n", weight[d], d;
100 | while read freq doc; do
# Split the "dockey-date" pair again; the doc key is STRING-escaped storage.
101 date="${doc##*-}" doc="$(UNSTRING "${doc%-*}")"
# Only report hits whose stored record still carries the same _indexdate —
# entries indexed under an older version of the document are dropped.
103 if J="$(DBM "$_records" get "$doc")"; then
104 [ "$date" -eq "$(json_get obj:"$J" _indexdate)" ] \
105 && printf '%f %s %s\n' \
106 "$freq" "$(STRING "$doc")" "$(STRING "$J")"
# Derive the index name from the first path segment of the request URI and
# locate its document record database.
111 _INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}"
112 _records="${_DATA}/${_INDEX}/_0_DOCS"
# NOTE(review): this tests $INDEX, but the variable assigned above is $_INDEX.
# If $INDEX is never set elsewhere, this 404 branch can never fire — likely a
# typo; confirm against the full file. (Also: [ expr -a expr ] is deprecated
# and ambiguous; two [ ] tests joined with && would be safer.)
114 if [ "${INDEX}" -a ! -d "${_DATA}/${_INDEX}" ]; then
115 printf '%s\r\n' "Status: 404 Not Found" ""
# Per-index HTTP Basic auth: auth.db maps index name -> list of accepted
# base64 credentials. No entry means the index is open.
117 elif authlist="$(DBM "${_DATA}/auth.db" get "${_INDEX}" )"; then
118 auth="$(HEADER Authorization)" auth="${auth#Basic }"
# The sentinel "deny" terminates the list; reaching it means no credential
# in the list matched the client's Authorization header.
119 for a in $authlist deny; do
120 [ "$auth" = "$a" ] && break
122 if [ "$a" = "deny" -o ! "$auth" ]; then
123 printf '%s\r\n' "Status: 401 Unauthorized" \
124 "WWW-Authenticate: Basic realm=\"Rigid Find\"" "" \
128 unset a auth authlist
# --- PUT /<index>/_doc/<id>: create or update an indexed document --------
131 if [ "$REQUEST_METHOD" = "PUT" ]; then
132 _doc="${PATH_INFO#"/${_INDEX}/_doc"}"
# Slurp the raw request body (Content-Length bytes), unparsed.
134 J="$(head -c "${CONTENT_LENGTH:-0}")"
135 # Don't use json parser to get content field
136 # Content can be very large and the json parser is slow
# Split the "content" field out with sed instead: it is expected to hold
# base64 data (no embedded quotes or escapes), so the naive regex match on
# "content":"[^"]*" used below is sufficient.
137 content="$(printf %s\\n "$J" |sed -E '
139 s;^.*,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*";;
143 J="$(printf %s\\n "$J" |sed -E '
145 s;,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*"[^"]*";;
147 J="$(json_load "${J}")"
# Trace sizes: the (large) raw content and the (small) parsed metadata.
149 debug "Content: ${#content} bytes"
150 debug "$(json_dump "$J")"
# Non-empty content: extract plain text and feed the word statistics into
# the per-index concordance, keyed by escaped doc id + index timestamp.
152 if [ "${#content}" -gt 0 ]; then
153 ingest "$J" "$content"\
154 | "${_EXEC}/concordance.sh" \
155 "$_DATA/$_INDEX/" "$(STRING "$_doc") $_DATE"
# Stamp the record with its index time (used later to drop stale hits).
159 J="$(DB2 "$J" set _indexdate num:"$_DATE")"
161 if [ "${#content}" -eq 0 ]; then
162 printf '%s: %s\r\n' "Status" "200 OK"
# insert fails when the key already exists; fall through to update then.
# Note: printf reuses its format string, emitting both the Status and the
# Location header from one call.
164 elif DBM "$_records" insert "$_doc" "$J"; then
165 printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")" \
167 elif DBM "$_records" update "$_doc" "$J"; then
168 printf '%s: %s\r\n' "Status" "200 OK"
# Neither insert nor update succeeded: storage failure.
171 printf '%s\r\n' "Status: 500 Internal Server Error" ""
176 X-elastic-product: Elasticsearch
\r
177 content-type: application/vnd.elasticsearch+json;compatible-with=8
\r
179 { "_index": $(json_dump str:"${_INDEX}"),
180 "_id": $(json_dump str:"$_doc"),
# --- DELETE /<index>/_doc/<id>: remove a document record ------------------
187 elif [ "$REQUEST_METHOD" = "DELETE" ]; then
188 _doc="${PATH_INFO#"/${_INDEX}/_doc"}"
# Existence check first so a missing id yields 404 rather than 500.
# NOTE(review): DBM get prints the stored value on success (see its use in
# search) — confirm its stdout is redirected in the elided lines, or the
# record body leaks into the HTTP response here.
190 if DBM "$_records" get "$_doc"; then
191 if DBM "$_records" delete "$_doc"; then
192 printf '%s: %s\r\n' "Status" "200 OK"
# Record exists but could not be deleted: storage failure.
195 printf '%s\r\n' "Status: 500 Internal Server Error" ""
199 printf '%s: %s\r\n' "Status" "404 Not Found"
204 X-elastic-product: Elasticsearch
\r
205 content-type: application/vnd.elasticsearch+json;compatible-with=8
\r
207 { "_index": $(json_dump str:"${_INDEX}"),
208 "_id": $(json_dump str:"$_doc"),
# --- POST (_search): run a query, build an Elasticsearch-style hit list ---
215 elif [ "$REQUEST_METHOD" = "POST" ]; then
216 J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
# Only the query.bool.must.bool.should clause is honoured; each element's
# match_phrase_prefix.content contributes words to the search, joined by
# spaces (clauses of other shapes are silently ignored via 2>/dev/null).
217 J="$(json_get "$J" query.bool.must.bool.should)"
220 for j in $(DB2 "$J" iterate @); do
221 json_get "$(UNSTRING "$j")" match_phrase_prefix.content
222 done 2>/dev/null |tr \\n ' '
224 debug "Search words: $words"
# $words is intentionally unquoted: each word becomes its own argument.
227 search "${_DATA}/${_INDEX}" $words \
228 | while read -r score id source; do
229 debug "Hit: $id $score"
# Assemble one hit object per result: _index, _id, _score, _source.
230 S="$(DB2 "" set _index str:"${_INDEX}")"
231 S="$(DB2 "$S" set _id str:"$(UNSTRING "${id#/}")")"
232 S="$(DB2 "$S" set _score num:"$score")"
233 S="$(DB2 "$S" set _source obj:"$(UNSTRING "$source")")"
234 printf 'obj:%s\t' "$(STRING "$S")"
# Strip the trailing separator left by the (elided) collection step.
237 results="${results% }"
# Elapsed request time in nanoseconds.
# NOTE(review): this computes start-minus-now, which is NEGATIVE — unless
# the elided lines 240-242 negate $t, the "took" value emitted below will be
# negative (and ns/1000 is microseconds, not the milliseconds Elasticsearch
# reports). Verify against the full file.
239 t="$(( ${_DATE}${_date_n} - $(date +%s%N) ))"
243 X-elastic-product: Elasticsearch
\r
244 Content-Type: application/vnd.elasticsearch+json;compatible-with=8
\r
246 { "took":$((t / 1000)),
249 "total":{"value": $(DB2 "$results" count @) ,"relation":"eq"},
250 "max_score": $(json_get "arr:$results" '[0]._score' 2>/dev/null || printf 0),
251 "hits": $(json_dump "arr:$results")
# --- HEAD: client handshake; advertise Elasticsearch compatibility --------
256 elif [ "$REQUEST_METHOD" = "HEAD" ]; then
257 accept="$(HEADER Accept)"
# Echo the client's Accept value as content-type when it names
# vnd.elasticsearch+json; otherwise fall back to plain application/json.
# (|| on an assignment is safe here: the assignment itself cannot fail.)
258 [ ! "${accept#*"vnd.elasticsearch+json"*}" ] \
259 && ctype="${accept}" || ctype="application/json"
263 X-elastic-product: Elasticsearch
\r
264 content-type: ${ctype}
\r
270 # elif [ "$REQUEST_METHOD" = "GET" ]; then
272 Status: 501 Not Implemented
\r
273 X-elastic-product: Elasticsearch
\r
274 content-type: text/plain
\r
276 Use the Nextcloud Elastic Search Plugin to use this service.