]> git.plutz.net Git - rigidfind/blob - index.cgi
5351b2b27e3d42ed17a4124b79647b2c78e294ad
[rigidfind] / index.cgi
1 #!/bin/sh
2
3 . "${_EXEC:-${0%/*}}/cgilite/cgilite.sh"
4 . "${_EXEC:-${0%/*}}/cgilite/storage.sh"
5 . "${_EXEC:-${0%/*}}/cgilite/json.sh"
6
# Request timestamp: keep a non-empty _DATE inherited from the environment,
# otherwise stamp the request with the current time in epoch seconds.
: "${_DATE:=$(date +%s)}"

# One-line trace of the incoming request.
debug "$REQUEST_METHOD  $REQUEST_URI    $SERVER_PROTOCOL        $_DATE"
10
# ingest JSON [BASE64_CONTENT]
#
# Write the plain text of a document to stdout.
#   $1  loaded JSON record; only its "title" is read, the file name suffix
#       selects the converter
#   $2  base64 encoded file content; when omitted, the global $content is
#       used instead
# Unknown suffixes produce no output and return 0.
# Returns the status of the last conversion step, or 1 when no temporary
# file could be created for a zip based format.
ingest() {
  local J="$1" data="${2-$content}" title ztmp member fill

  title="$(json_get "$J" title)"

  case "$title" in
    *.md|*.txt|*.csv)
      printf %s "$data" | base64 -d
      return
      ;;
    *.pdf)
      printf %s "$data" | base64 -d | pdftotext - -
      return
      ;;
    *.doc)
      printf %s "$data" | base64 -d | catdoc /dev/stdin
      return
      ;;
    *.xls)
      printf %s "$data" | base64 -d | xls2csv /dev/stdin
      return
      ;;
    *.ppt)
      printf %s "$data" | base64 -d | catppt /dev/stdin
      return
      ;;
    *.html|*.xml|*.svg)
      # Crude tag stripping, good enough for indexing.
      printf %s "$data" | base64 -d | sed 's;<[^>]*>;;g'
      return
      ;;
    # Zip containers: pick the archive member holding the text and the
    # string each XML tag is replaced with (a blank keeps cells apart).
    *.docx)      member="word/document.xml"    fill=""  ;;
    *.xlsx)      member="xl/sharedStrings.xml" fill=" " ;;
    *.odt)       member="content.xml"          fill=""  ;;
    *.ods|*.odp) member="content.xml"          fill=" " ;;
    *)
      return 0
      ;;
  esac

  # unzip needs a seekable file. Use mktemp rather than a predictable
  # PID-based name in a shared temp directory.
  ztmp="$(mktemp "${TMP:-/tmp}/zipfile_XXXXXX")" || return 1
  printf %s "$data" | base64 -d >"$ztmp"
  # Cap the extracted XML at 128M before stripping the tags.
  unzip -qc "$ztmp" "$member" \
  | head -c 128M | sed "s;<[^>]*>;${fill};g"
  rm -f -- "$ztmp"
}
68
# search INDEX_DIR WORD...
#
# Look up all search words in the concordance below INDEX_DIR and print one
# line per matching document:
#     <score> TAB <STRING encoded doc id> TAB <STRING encoded record>
# sorted by descending score.
#   - The words are split on punctuation / whitespace and lowercased.
#   - Every word is a file in INDEX_DIR whose lines are read as
#     "num total freq doc date".
#   - Only documents hit by the largest number of words are reported; the
#     score is the sum of their freq values.
#   - A hit is dropped unless the record stored in the global $_records
#     database carries an _indexdate equal to the date in the index line.
search() {
  local index="$1" words w num total freq doc date J
  shift 1
  words="$*"

  words="$(printf '%s\n' "$words" | awk '
    BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
            # The probe character U+00A1 is spliced in as raw bytes so the
            # check does not depend on the encoding of this script file.
            if ( length("'"$(printf '\302\241')"'") == 1 )  # UTF-8 aware AWK
              FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
            else                                            # byte-wise AWK: spell the block out as UTF-8 byte sequences
              FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
          }
      { for (n = 1; n <= NF; n++) printf "%s\t", tolower($n); }
  ')"

  # $words is split on purpose. The separator set above already removed
  # glob characters and "/" so each word is a plain file name.
  for w in ${words}; do
    [ -f "${index}/$w" ] || continue

    # -r keeps the backslash escapes of the encoded doc id intact
    while read -r num total freq doc date; do
      printf '%s-%i\t%f\n' "${doc}" "${date}" "$freq"
    done <"${index}/$w"
  done \
  | awk '
        { cnt[$1]++; weight[$1] = weight[$1] ? weight[$1] + $2 : $2; }
    END { # keep only the documents matched by the most words
          m = 0; for (d in cnt) m = ( m < cnt[d] ) ? cnt[d] : m;
          for (d in cnt) if ( cnt[d] == m ) printf "%f\t%s\n", weight[d], d;
        }
  ' \
  | sort -nr \
  | while read -r freq doc; do
    # key is "<doc>-<date>"; the date is whatever follows the last dash
    date="${doc##*-}"
    doc="$(UNSTRING "${doc%-*}")"

    if J="$(DBM "$_records" get "$doc")"; then
      # skip index entries left over from an older version of the document
      [ "$date" -eq "$(json_get obj:"$J" _indexdate)" ] \
      && printf '%f\t%s\t%s\n' \
         "$freq" "$(STRING "$doc")" "$(STRING "$J")"
    fi
  done
}
108
# The index name is the first path component of PATH_INFO. Its document
# records live in the _0_DOCS database inside the index directory.
_INDEX="${PATH_INFO#/}"
_INDEX="${_INDEX%%/*}"
_records="${_DATA}/${_INDEX}/_0_DOCS"

# Requests naming an index that has no directory are answered with 404.
# (The guard used to test ${INDEX}, which this script never sets, so it
# could not trigger.)
# NOTE(review): if index directories are meant to be created implicitly by
# the first PUT, that path needs an exception here - confirm.
if [ -n "${_INDEX}" ] && [ ! -d "${_DATA}/${_INDEX}" ]; then
  printf '%s\r\n' "Status: 404 Not Found" ""
  exit 0
elif authlist="$(DBM "${_DATA}/auth.db" get "${_INDEX}" )"; then
  # An auth.db entry exists for this index: the Basic credential of the
  # request must equal one of the whitespace separated tokens in it.
  auth="$(HEADER Authorization)"
  auth="${auth#Basic }"

  # "deny" is appended as a sentinel; the loop ends with a=deny when no
  # listed token matched.
  for a in $authlist deny; do
    [ "$auth" = "$a" ] && break
  done

  if [ "$a" = "deny" ] || [ -z "$auth" ]; then
    # The response is piped through debug used as a filter.
    # NOTE(review): presumably debug copies stdin to stdout - confirm in
    # cgilite.sh.
    printf '%s\r\n' "Status: 401 Unauthorized" \
      "WWW-Authenticate: Basic realm=\"Rigid Find\"" "" \
    | debug
    exit 0
  fi
  unset a auth authlist
fi
128
# ---------------------------------------------------------------------------
# Request dispatch: a minimal Elasticsearch-like HTTP API.
#   PUT    /<index>/_doc/<id>   index a document
#   DELETE /<index>/_doc/<id>   remove a document record
#   POST                        search (query.bool.must.bool.should)
#   HEAD                        header-only probe
#   GET    /<index>/            index description; any other path: node info
# Header lines are written with printf so that each one ends in a real CR LF
# pair. JSON bodies use flush-left here-documents, which do not depend on
# TAB indentation surviving in this file.
# ---------------------------------------------------------------------------

# A literal TAB. The hit list below is assembled from TAB terminated items,
# and the document key handed to concordance.sh is TAB separated.
_TAB="$(printf '\t')"

# _es_ctype ACCEPT
# Print the content type to answer with: the Accept value itself when it
# names the Elasticsearch vendor type, application/json otherwise (also for
# an empty Accept header).
_es_ctype() {
  case "$1" in
    *"vnd.elasticsearch+json"*) printf '%s' "$1" ;;
    *) printf '%s' "application/json" ;;
  esac
}

# _es_doc_reply
# Finish a PUT/DELETE response: remaining headers, blank line and the JSON
# result body. Reads the globals _INDEX, _doc, result and _DATE. The Status
# line (and Location, if any) must already have been written.
_es_doc_reply() {
  printf '%s\r\n' \
    "X-elastic-product: Elasticsearch" \
    "content-type: application/vnd.elasticsearch+json;compatible-with=8" \
    ""
  cat <<EOF
{ "_index": $(json_dump str:"${_INDEX}"),
  "_id": $(json_dump str:"$_doc"),
  "result": "$result",
  "_indexdate": $_DATE
}
EOF
}

if   [ "$REQUEST_METHOD" = "PUT" ]; then
  _doc="${PATH_INFO#"/${_INDEX}/_doc"}"

  J="$(head -c "${CONTENT_LENGTH:-0}")"
  # Don't use json parser to get content field
  # Content can be very large and the json parser is slow
  # The sed script slurps the whole body, cuts out the string value of
  # "content" and drops all backslashes from it.
  content="$(printf %s\\n "$J" |sed -E '
    :X; $bY; N; bX; :Y;
    s;^.*,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*";;
    s;".*$;;
    s;\\;;g;
  ')"
  # Remove the "content" member before handing the rest to the parser.
  J="$(printf %s\\n "$J" |sed -E '
    :X; $bY; N; bX; :Y;
    s;,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*"[^"]*";;
  ')"
  J="$(json_load "${J}")"

  debug "Content: ${#content} bytes"
  debug "$(json_dump "$J")"

  if [ "${#content}" -gt 0 ]; then
    ingest "$J" "$content" \
    | "${_EXEC}/concordance.sh" \
      "$_DATA/$_INDEX/" "$(STRING "$_doc")${_TAB}$_DATE"
  fi

  J="${J#obj:}"
  J="$(DB2 "$J" set _indexdate num:"$_DATE")"

  if [ "${#content}" -eq 0 ]; then
    printf '%s: %s\r\n' "Status" "200 OK"
    result="updated"
  elif DBM "$_records" insert "$_doc" "$J"; then
    # No line continuation after printf: result= must be its own command,
    # not one more printf argument.
    printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")"
    result="created"
  elif DBM "$_records" update "$_doc" "$J"; then
    printf '%s: %s\r\n' "Status" "200 OK"
    result="updated"
  else
    printf '%s\r\n' "Status: 500 Internal Server Error" ""
    exit 0
  fi

  _es_doc_reply
  exit 0

elif [ "$REQUEST_METHOD" = "DELETE" ]; then
  _doc="${PATH_INFO#"/${_INDEX}/_doc"}"

  # Existence check only: discard the record so it cannot end up on stdout
  # in front of the Status header.
  if   DBM "$_records" get "$_doc" >/dev/null; then
    if   DBM "$_records" delete "$_doc"; then
      printf '%s: %s\r\n' "Status" "200 OK"
      result="deleted"
    else
      printf '%s\r\n' "Status: 500 Internal Server Error" ""
      exit 0
    fi
  else
    printf '%s: %s\r\n' "Status" "404 Not Found"
    result="not_found"
  fi

  _es_doc_reply
  exit 0

elif [ "$REQUEST_METHOD" = "POST" ]; then
  J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}" |debug)")"
  J="$(json_get "$J" query.bool.must.bool.should)"

  # Collect the match_phrase_prefix.content value of every "should" clause.
  words="$(
    for j in $(DB2 "$J" iterate @); do
      json_get "$(UNSTRING "$j")" match_phrase_prefix.content
    done 2>/dev/null
  )"
  debug "Search words: $words"

  # Build the hit array: "@" followed by TAB separated "obj:<...>" items.
  # $words is split on purpose, each word becomes one argument of search.
  results="@${_TAB}$(
    search "${_DATA}/${_INDEX}" $words \
    | while read -r score id source; do
      debug "Hit: $id   $score"
      S="$(DB2   "" set _index  str:"${_INDEX}")"
      S="$(DB2 "$S" set _id     str:"$(UNSTRING "${id#/}")")"
      S="$(DB2 "$S" set _score  num:"$score")"
      S="$(DB2 "$S" set _source obj:"$(UNSTRING "$source")")"
      printf 'obj:%s\t' "$(STRING "$S")"
    done
  )"
  # drop the TAB left behind by the last item (or by the bare marker)
  results="${results%"${_TAB}"}"

  printf '%s\r\n' \
    "Status: 200 OK" \
    "X-elastic-product: Elasticsearch" \
    "Content-Type: application/vnd.elasticsearch+json;compatible-with=8" \
    ""
  cat <<EOF
{ "took":0,
  "timed_out":false,
  "_shards":{"total":1,"successful":1,"skipped":0,"failed":0},
  "hits": {
    "total":{"value": $(DB2 "$results" count @) ,"relation":"eq"},
    "max_score": $(json_get "arr:$results" '[0]._score' 2>/dev/null || printf 0),
    "hits": $(json_dump "arr:$results")
  }
}
EOF

elif [ "$REQUEST_METHOD" = "HEAD" ]; then
  ctype="$(_es_ctype "$(HEADER Accept)")"

  printf '%s\r\n' \
    "Status: 200 OK" \
    "X-elastic-product: Elasticsearch" \
    "content-type: ${ctype}" \
    ""
  exit 0

elif [ "$REQUEST_METHOD" = "GET" ]; then
  ctype="$(_es_ctype "$(HEADER Accept)")"

  printf '%s\r\n' \
    "Status: 200 OK" \
    "X-elastic-product: Elasticsearch" \
    "content-type: ${ctype}" \
    ""

  if [ "$PATH_INFO" = "/${_INDEX}/" ]; then
    # Static index description; only the name and the creation date (birth
    # time of the index directory as reported by stat) are filled in.
    cat <<EOF
{ $(json_dump str:"${_INDEX}"): {
    "aliases":{},
    "mappings": {
      "properties": {
        "content": {"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "hash":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "metatags":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "owner":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "parts":{"properties":{"comments":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
        "provider":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "share_names":{"properties":{"paul":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
        "source":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "title":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}
      }
    },
    "settings": {
      "index": {
        "routing":{"allocation":{"include":{"_tier_preference":"data_content"}}},
        "number_of_shards":"1",
        "provided_name": $(json_dump str:"${_INDEX}"),
        "creation_date": "$(stat -c %W "${_DATA}/${_INDEX}")",
        "number_of_replicas":"1",
        "uuid":"0000000000000000000000",
        "version":{"created":"8500010"}
      }
    }
  }
}
EOF
  else
    # Any other path: static node / version information.
    cat <<EOF
{ "name" : "head",
  "cluster_name" : "elasticsearch",
  "version" : {
    "number" : "8.12.1",
    "lucene_version" : "9.9.2",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}
EOF
  fi
  exit 0

else
  # Any other request method is not handled.
  printf '%s\r\n' "Status: 500 Internal Server Error" ""
  exit 0
fi