]> git.plutz.net Git - rigidfind/blob - index.cgi
avoid indexing empty content
[rigidfind] / index.cgi
1 #!/bin/sh
2
3 . "${_EXEC:-${0%/*}}/cgilite/cgilite.sh"
4 . "${_EXEC:-${0%/*}}/cgilite/storage.sh"
5 . "${_EXEC:-${0%/*}}/cgilite/json.sh"
6
# Request timestamp (seconds since the epoch). A value already present in
# the environment wins; otherwise the current time is used.
: "${_DATE:=$(date +%s)}"

# Trace every request: method, URI, protocol and the timestamp chosen above.
debug "$REQUEST_METHOD  $REQUEST_URI    $SERVER_PROTOCOL        $_DATE"
10
# Decode the base64 payload given in $1 and write the raw bytes to stdout.
_ingest_decode() {
  printf %s "$1" | base64 -d
}

# Filter stdin to stdout, replacing every <...> markup tag with $1
# (pass an empty string to drop tags, a blank to keep words separated).
_ingest_untag() {
  sed "s;<[^>]*>;$1;g"
}

# Treat the base64 payload $1 as a zip archive, print its member $2,
# cap the output at 128M and replace markup tags with $3.
_ingest_zip_member() {
  _ingest_decode "$1" \
  | unzip -qc /dev/stdin "$2" \
  | head -c 128M | _ingest_untag "$3"
}

# Convert a document payload to plain text on stdout.
#
# Arguments:
#   $1 - JSON handle of the document, as accepted by json_get; only the
#        "title" field is read, its file extension selects the converter
#   $2 - base64 encoded document content; when omitted, the global
#        $content is used instead (this was the only source previously,
#        although callers already passed the payload as $2)
# Outputs:
#   extracted text; nothing for unsupported title extensions
# External tools used, depending on type: base64, pdftotext, catdoc,
# xls2csv, catppt, unzip, sed, head.
ingest() {
  local J="$1" data="${2-$content}"

  case $(json_get "$J" title) in
    *.md|*.txt|*.csv)
      _ingest_decode "$data"
      ;;
    *.pdf)
      _ingest_decode "$data" | pdftotext - -
      ;;
    *.doc)
      _ingest_decode "$data" | catdoc /dev/stdin
      ;;
    *.xls)
      _ingest_decode "$data" | xls2csv /dev/stdin
      ;;
    *.ppt)
      _ingest_decode "$data" | catppt /dev/stdin
      ;;
    *.html|*.xml|*.svg)
      _ingest_decode "$data" | _ingest_untag ''
      ;;
    *.docx)
      _ingest_zip_member "$data" word/document.xml ''
      ;;
    *.xlsx)
      # Cells are adjacent elements; a blank keeps their values apart.
      _ingest_zip_member "$data" xl/sharedStrings.xml ' '
      ;;
    *.odt)
      _ingest_zip_member "$data" content.xml ''
      ;;
    *.ods|*.odp)
      _ingest_zip_member "$data" content.xml ' '
      ;;
    *)
      # Unknown type: contribute no text.
      :
      ;;
  esac
}
64
# Look up search terms in a word index and print matching documents.
#
# Arguments:
#   $1   - index directory; holds one file per indexed word, each line
#          being "num total freq doc date" (whitespace separated)
#   $2.. - search phrase(s); joined with blanks and tokenized below
# Globals:
#   _records (read) - record store handed to "DBM ... get"
# Outputs:
#   one line per hit: "<summed freq> <STRING doc id> <STRING record>",
#   sorted by descending summed frequency. Only documents matching the
#   largest number of search words are listed, and only if the date stored
#   in the index equals the record's current _indexdate (stale index
#   entries are skipped).
# Relies on UNSTRING, STRING, DBM and json_get from the sourced libraries.
search() {
  local index="$1" words w num total freq doc date J
  shift 1; words="$*"

  # Tokenize: split on ASCII punctuation/whitespace, %XX url escapes and
  # the Unicode punctuation block U+2000 - U+206F, then lowercase.
  # The probe string is U+00A1 written as octal bytes so that it survives
  # any re-encoding of this file: a UTF-8 aware awk reports length 1 and
  # can use a character range; other awks match the UTF-8 byte sequences.
  words="$(printf %s\\n "$words" | awk '
    BEGIN {
      ascii = "[] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}";
      if ( length("\302\241") == 1 )  # UTF-8 aware awk
        FS = "(" ascii "|'"$(printf '[\342\200\200-\342\201\257]')"')+";
      else                            # byte-wise fallback
        FS = "(" ascii "|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
    }
    { for (n = 1; n <= NF; n++) printf "%s  ", tolower($n); }
  ')"

  # Word splitting is intended here; the tokenizer above already removed
  # glob characters, slashes and dots from the words.
  for w in ${words}; do
    [ -f "${index}/$w" ] || continue

    # -r keeps the backslash escapes of the encoded doc id intact.
    while read -r num total freq doc date; do
      printf '%s-%i\t%f\n' "${doc}" "${date}" "$freq"
    done <"${index}/$w"
  done \
  | awk '
        { cnt[$1]++; weight[$1] = weight[$1] ? weight[$1] + $2 : $2; }
    END { m = 0; for (d in cnt) m = ( m < cnt[d] ) ? cnt[d] : m;
          for (d in cnt) if ( cnt[d] == m ) printf "%f\t%s\n", weight[d], d;
        }
  ' \
  | sort -nr \
  | while read -r freq doc; do
    # Keys have the form "<encoded doc>-<date>"; split at the last dash.
    date="${doc##*-}" doc="$(UNSTRING "${doc%-*}")"

    if J="$(DBM "$_records" get "$doc")"; then
      [ "$date" -eq "$(json_get obj:"$J" _indexdate)" ] \
      && printf '%f\t%s\t%s\n' \
         "$freq" "$(STRING "$doc")" "$(STRING "$J")"
    fi
  done
}
104
# ---------------------------------------------------------------------------
# Request dispatch
#
# Emulates a small part of the Elasticsearch HTTP API:
#   PUT    /<index>/_doc/<id>  store a document record and index its content
#   DELETE /<index>/_doc/<id>  remove a document record
#   POST   /<index>/...        search (query.bool.must.bool.should[] with
#                              match_phrase_prefix.content entries)
#   HEAD   any path            liveness probe
#   GET    /<index>/           index description; any other path: server info
#
# Header lines are written with printf and an explicit CRLF, JSON bodies
# with unindented here-documents, so neither depends on tab indentation or
# on raw carriage-return bytes being preserved in this file.
# ---------------------------------------------------------------------------

# First path component of PATH_INFO is the index name.
_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}"
_records="${_DATA}/${_INDEX}/_0_DOCS"
_TAB="$(printf '\t')"

# Print the content type for HEAD/GET responses: the request's Accept
# value when it matches the test below, "application/json" otherwise
# (also for a missing or empty Accept header).
# NOTE(review): the expansion removes the shortest prefix ending in the
# marker, so it is empty only if nothing follows the marker; an Accept
# value with trailing parameters is answered with application/json.
# Confirm whether a plain "contains" test was intended.
es_ctype() {
  local accept
  accept="$(HEADER Accept)"
  if [ "$accept" ] && [ ! "${accept#*"vnd.elasticsearch+json"*}" ]; then
    printf %s "$accept"
  else
    printf %s "application/json"
  fi
}

# Finish a PUT/DELETE response whose Status line was already sent:
# remaining headers, blank line, and the JSON result object.
#   $1 - value for the "result" member
es_doc_result() {
  printf '%s\r\n' \
    "X-elastic-product: Elasticsearch" \
    "content-type: application/vnd.elasticsearch+json;compatible-with=8" \
    ""
  cat <<EOF
{ "_index": $(json_dump str:"${_INDEX}"),
  "_id": $(json_dump str:"$_doc"),
  "result": "$1",
  "_indexdate": $_DATE
}
EOF
}

if   [ "$REQUEST_METHOD" = "PUT" ]; then
  _doc="${PATH_INFO#"/${_INDEX}/_doc"}"

  J="$(head -c "${CONTENT_LENGTH:-0}")"
  # Don't use json parser to get content field
  # Content can be very large and the json parser is slow
  # First sed: keep only the string value following "content".
  content="$(printf %s\\n "$J" |sed -E '
    :X; $bY; N; bX; :Y;
    s;^.*,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*";;
    s;".*$;;
    s;\\;;g;
  ')"
  # Second sed: drop the content member before the JSON parser sees it.
  J="$(printf %s\\n "$J" |sed -E '
    :X; $bY; N; bX; :Y;
    s;,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*"[^"]*";;
  ')"
  J="$(json_load "${J}")"

  debug "Content: ${#content} bytes"
  debug "$(json_dump "$J")"

  # Build the word index; the second argument of concordance.sh is the
  # encoded document id and the index date, separated by a tab.
  if [ "${#content}" -gt 0 ]; then
    ingest "$J" "$content"\
    | "${_EXEC}/concordance.sh" \
      "$_DATA/$_INDEX/" "$(STRING "$_doc")${_TAB}$_DATE"
  fi

  J="${J#obj:}"
  J="$(DB2 "$J" set _indexdate num:"$_DATE")"

  if [ "${#content}" -eq 0 ]; then
    # Empty content: nothing is indexed and no record is stored.
    printf '%s: %s\r\n' "Status" "200 OK"
    result="updated"
  elif DBM "$_records" insert "$_doc" "$J"; then
    # No line continuation after this printf: result must be assigned,
    # not passed to printf as a further argument.
    printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")"
    result="created"
  elif DBM "$_records" update "$_doc" "$J"; then
    printf '%s: %s\r\n' "Status" "200 OK"
    result="updated"
  else
    printf '%s\r\n' "Status: 500 Internal Server Error" ""
    exit 0
  fi

  es_doc_result "$result"
  exit 0

elif [ "$REQUEST_METHOD" = "DELETE" ]; then
  _doc="${PATH_INFO#"/${_INDEX}/_doc"}"

  # "get" serves as existence test only; its output (the record) must not
  # reach stdout ahead of the response headers.
  if   DBM "$_records" get "$_doc" >/dev/null; then
    if   DBM "$_records" delete "$_doc"; then
      printf '%s: %s\r\n' "Status" "200 OK"
      result="deleted"
    else
      printf '%s\r\n' "Status: 500 Internal Server Error" ""
      exit 0
    fi
  else
    printf '%s: %s\r\n' "Status" "404 Not Found"
    result="not_found"
  fi

  es_doc_result "$result"
  exit 0

elif [ "$REQUEST_METHOD" = "POST" ]; then
  J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
  J="$(json_get "$J" query.bool.must.bool.should)"

  # Collect the phrase of every "should" clause, one per line.
  words="$(
    for j in $(DB2 "$J" iterate @); do
      json_get "$(UNSTRING "$j")" match_phrase_prefix.content
    done 2>/dev/null
  )"

  # Build a tab separated array record ("@" marker plus one obj: element
  # per hit). $words is quoted: search tokenizes the phrase itself, and an
  # unquoted expansion would glob-expand wildcards from the query.
  results="@${_TAB}$(
    search "${_DATA}/${_INDEX}" "$words" \
    | while read -r score id source; do
      S="$(DB2   "" set _index  str:"${_INDEX}")"
      S="$(DB2 "$S" set _id     str:"$(UNSTRING "${id#/}")")"
      S="$(DB2 "$S" set _score  num:"$score")"
      S="$(DB2 "$S" set _source obj:"$(UNSTRING "$source")")"
      printf 'obj:%s\t' "$(STRING "$S")"
    done
  )"
  # Remove the trailing separator (also turns an empty result into "@").
  results="${results%"${_TAB}"}"

  printf '%s\r\n' \
    "Status: 200 OK" \
    "X-elastic-product: Elasticsearch" \
    "Content-Type: application/vnd.elasticsearch+json;compatible-with=8" \
    ""
  cat <<EOF
{ "took":0,
  "timed_out":false,
  "_shards":{"total":1,"successful":1,"skipped":0,"failed":0},
  "hits": {
    "total":{"value": $(DB2 "$results" count @) ,"relation":"eq"},
    "max_score": $(json_get "arr:$results" '[0]._score' 2>/dev/null || printf 0),
    "hits": $(json_dump "arr:$results")
  }
}
EOF

elif [ "$REQUEST_METHOD" = "HEAD" ]; then
  # NOTE(review): HEAD and GET send an "HTTP/1.1" status line while the
  # other methods send a CGI "Status:" header; kept as is, confirm that
  # this difference is intended.
  printf '%s\r\n' \
    "HTTP/1.1 200 OK" \
    "X-elastic-product: Elasticsearch" \
    "content-type: $(es_ctype)" \
    ""
  exit 0

elif [ "$REQUEST_METHOD" = "GET" ]; then
  printf '%s\r\n' \
    "HTTP/1.1 200 OK" \
    "X-elastic-product: Elasticsearch" \
    "content-type: $(es_ctype)" \
    ""

  if [ "$PATH_INFO" = "/${_INDEX}/" ]; then
    # Static index description; only the index name and the creation date
    # (birth time of the index directory) are filled in dynamically.
    cat <<EOF
{ $(json_dump str:"${_INDEX}"): {
    "aliases":{},
    "mappings": {
      "properties": {
        "content": {"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "hash":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "metatags":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "owner":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "parts":{"properties":{"comments":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
        "provider":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "share_names":{"properties":{"paul":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
        "source":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "title":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}
      }
    },
    "settings": {
      "index": {
        "routing":{"allocation":{"include":{"_tier_preference":"data_content"}}},
        "number_of_shards":"1",
        "provided_name": $(json_dump str:"${_INDEX}"),
        "creation_date": "$(stat -c %W "${_DATA}/${_INDEX}")",
        "number_of_replicas":"1",
        "uuid":"0000000000000000000000",
        "version":{"created":"8500010"}
      }
    }
  }
}
EOF
  else
    # Server banner announcing a fixed version.
    cat <<EOF
{ "name" : "head",
  "cluster_name" : "elasticsearch",
  "version" : {
    "number" : "8.12.1",
    "lucene_version" : "9.9.2",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}
EOF
  fi
  exit 0

else
  # Any other request method is answered with a server error.
  printf '%s\r\n' "Status: 500 Internal Server Error" ""
  exit 0
fi