]> git.plutz.net Git - rigidfind/blob - index.cgi
d4f4cec6f8ed573ce27032ccdf9b0467798d0fee
[rigidfind] / index.cgi
#!/bin/sh
# rigidfind CGI entry point.
#
# Pull in the cgilite helper libraries.  They are looked up below $_EXEC
# when that variable is set, otherwise below the directory of this script.

. "${_EXEC:-${0%/*}}/cgilite/cgilite.sh"
. "${_EXEC:-${0%/*}}/cgilite/storage.sh"
. "${_EXEC:-${0%/*}}/cgilite/json.sh"

# Request timestamp in seconds since the epoch; a value already present in
# the environment is kept as is.
if [ -z "$_DATE" ]; then
  _DATE="$(date +%s)"
fi

# Hand the request line to debug (not defined in this file; presumably
# supplied by one of the libraries sourced above).
debug "$REQUEST_METHOD  $REQUEST_URI    $SERVER_PROTOCOL"
10
# ingest JSON [CONTENT]
#
# Write a plain-text rendition of an uploaded document to stdout.
#   $1  parsed JSON record of the document; only its "title" field is
#       consulted, and only the file extension of the title decides which
#       converter is run
#   $2  base64 encoded document body; when the argument is omitted the
#       global $content is used instead
# Titles with an unknown extension produce no output at all.
# The converters (pdftotext, catdoc, xls2csv, catppt, unzip) are external
# programs and must be installed for the respective formats to work.
ingest() {
  local J="$1" data="${2-$content}" title

  # Match the extension case-insensitively, so "REPORT.PDF" is handled
  # like "report.pdf".
  title="$(json_get "$J" title | tr '[:upper:]' '[:lower:]')"

  case "$title" in
    *.md|*.txt|*.csv)
      printf %s "$data" |base64 -d
      ;;
    *.pdf)
      printf %s "$data" |base64 -d \
      | pdftotext - -
      ;;
    *.doc)
      printf %s "$data" |base64 -d \
      | catdoc /dev/stdin
      ;;
    *.xls)
      printf %s "$data" |base64 -d \
      | xls2csv /dev/stdin
      ;;
    *.ppt)
      printf %s "$data" |base64 -d \
      | catppt /dev/stdin
      ;;
    *.html|*.xml|*.svg)
      # Crude markup removal: drop everything between "<" and ">".
      printf %s "$data" |base64 -d \
      | sed 's;<[^>]*>;;g'
      ;;
    *.docx)
      # Zipped XML formats: extract the member that carries the text, cap
      # the amount of data, then strip the tags.
      printf %s "$data" |base64 -d \
      | unzip -qc /dev/stdin word/document.xml \
      | head -c 128M | sed 's;<[^>]*>;;g'
      ;;
    *.xlsx)
      # Tags become blanks here so that neighbouring cells stay separate
      # words.
      printf %s "$data" |base64 -d \
      | unzip -qc /dev/stdin xl/sharedStrings.xml \
      | head -c 128M | sed 's;<[^>]*>; ;g'
      ;;
    *.odt)
      printf %s "$data" |base64 -d \
      | unzip -qc /dev/stdin content.xml \
      | head -c 128M | sed 's;<[^>]*>;;g'
      ;;
    *.ods|*.odp)
      printf %s "$data" |base64 -d \
      | unzip -qc /dev/stdin content.xml \
      | head -c 128M | sed 's;<[^>]*>; ;g'
      ;;
    *)
      # Unsupported type: nothing to index.
      :
      ;;
  esac
}
64
# search INDEX WORD...
#
# Look up the given words in the index directory INDEX and print one line
# per matching document to stdout:
#
#   <score> TAB <STRING encoded doc id> TAB <STRING encoded record>
#
# best score first.
#   $1     index directory; it holds one file per indexed word whose lines
#          are read as "num total freq doc date"
#   $2...  search terms; they are joined, lower-cased and split at
#          whitespace, punctuation and %XX sequences
# Only documents that matched the highest number of query words are
# reported; the score is the sum of the "freq" column over those words.
# A hit is dropped when the record stored in the global $_records is
# missing or carries a different _indexdate than the index entry.
# Relies on UNSTRING, STRING, DBM and json_get, none of which is defined in
# this file.
search() {
  local index="$1" words w num total freq doc date J
  shift 1; words="$*"

  # The first FS variant is for an awk that counts characters, the second
  # one spells the same Unicode range (U+2000 - U+206F) out as byte
  # sequences for an awk that counts bytes.
  words="$(printf %s\\n "$words" | awk '
    BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
            if ( length("¡") == 1 )  # Utf-8 aware AWK
              FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
            else                     # UTF-8 Hack
              FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
          }
      { for (n = 1; n <= NF; n++) printf "%s ", tolower($n); }
  ')"

  # $words is split on purpose.  Glob characters and "/" are part of FS
  # above, so they should not survive into the individual words.
  for w in ${words}; do
    [ -f "${index}/$w" ] || continue

    # -r keeps backslashes of the encoded document id intact.
    while read -r num total freq doc date; do
      printf '%s-%i\t%f\n' "${doc}" "${date}" "$freq"
    done <"${index}/$w"
  done \
  | awk '
        { cnt[$1]++; weight[$1] += $2; }
    END { m = 0; for (d in cnt) m = ( m < cnt[d] ) ? cnt[d] : m;
          for (d in cnt) if ( cnt[d] == m ) printf "%f\t%s\n", weight[d], d;
        }
  ' \
  | sort -nr \
  | while read -r freq doc; do
    # Keys look like "<doc>-<date>"; the date is whatever follows the last
    # dash.
    date="${doc##*-}" doc="$(UNSTRING "${doc%-*}")"

    if J="$(DBM "$_records" get "$doc")"; then
      [ "$date" -eq "$(json_get obj:"$J" _indexdate)" ] \
      && printf '%f\t%s\t%s\n' \
         "$freq" "$(STRING "$doc")" "$(STRING "$J")"
    fi
  done
}
104
# --- Request dispatch --------------------------------------------------------
#
# The first component of PATH_INFO names the index.  The per-index document
# records are kept in the DBM file "_0_DOCS" inside the index directory.
# NOTE(review): _INDEX comes straight from PATH_INFO and is used as a
# directory name below $_DATA - confirm that ".." segments cannot get here.
_INDEX="${PATH_INFO#/}" _INDEX="${_INDEX%%/*}"
_records="${_DATA}/${_INDEX}/_0_DOCS"

# All response heredocs below are written flush left with a plain "<<EOF".
# The "<<-" form strips leading tabs only, so a body or terminator indented
# with blanks would be sent verbatim or would never end the heredoc.
# Every response line gets a trailing CR appended by sed.

if   [ "$REQUEST_METHOD" = "PUT" ]; then
  # PUT /<index>/_doc/<id>: store the record and index its content.
  _doc="${PATH_INFO#"/${_INDEX}/_doc"}"

  J="$(head -c "${CONTENT_LENGTH:-0}")"
  # Don't use json parser to get content field
  # Content can be very large and the json parser is slow
  content="$(printf %s\\n "$J" |sed -E '
    :X; $bY; N; bX; :Y;
    s;^.*,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*";;
    s;".*$;;
    s;\\;;g;
  ')"
  # Remove the content field before the remaining record is parsed.
  J="$(printf %s\\n "$J" |sed -E '
    :X; $bY; N; bX; :Y;
    s;,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*"[^"]*";;
  ')"
  J="$(json_load "${J}")"

  # concordance.sh receives the extracted text on stdin, the index
  # directory, and "<encoded doc id> TAB <timestamp>".  It is looked up
  # with the same fallback as the libraries sourced at the top of the file.
  ingest "$J" "$content"\
  | "${_EXEC:-${0%/*}}/concordance.sh" \
    "$_DATA/$_INDEX/" "$(printf '%s\t%s' "$(STRING "$_doc")" "$_DATE")"

  J="${J#obj:}"
  J="$(DB2 "$J" set _indexdate num:"$_DATE")"

  if   DBM "$_records" insert "$_doc" "$J"; then
    printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")"
    result="created"
  elif DBM "$_records" update "$_doc" "$J"; then
    printf '%s: %s\r\n' "Status" "200 OK"
    result="updated"
  else
    printf '%s\r\n' "Status: 500 Internal Server Error" ""
    exit 0
  fi

  sed 's;$;\r;' <<EOF
X-elastic-product: Elasticsearch
content-type: application/vnd.elasticsearch+json;compatible-with=8

{ "_index": $(json_dump str:"${_INDEX}"),
  "_id": $(json_dump str:"$_doc"),
  "result": "$result",
  "_indexdate": $_DATE
}
EOF
  exit 0

elif [ "$REQUEST_METHOD" = "DELETE" ]; then
  # DELETE /<index>/_doc/<id>: drop the stored record.
  _doc="${PATH_INFO#"/${_INDEX}/_doc"}"

  # "get" serves as an existence test only; its output must not end up in
  # front of the response headers.
  if   DBM "$_records" get "$_doc" >/dev/null; then
    if   DBM "$_records" delete "$_doc"; then
      printf '%s: %s\r\n' "Status" "200 OK"
      result="deleted"
    else
      printf '%s\r\n' "Status: 500 Internal Server Error" ""
      exit 0
    fi
  else
    printf '%s: %s\r\n' "Status" "404 Not Found"
    result="not_found"
  fi

  sed 's;$;\r;' <<EOF
X-elastic-product: Elasticsearch
content-type: application/vnd.elasticsearch+json;compatible-with=8

{ "_index": $(json_dump str:"${_INDEX}"),
  "_id": $(json_dump str:"$_doc"),
  "result": "$result",
  "_indexdate": $_DATE
}
EOF
  exit 0

elif [ "$REQUEST_METHOD" = "POST" ]; then
  # POST: search request.  Only the match_phrase_prefix.content values
  # found below query.bool.must.bool.should are evaluated.
  J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
  J="$(json_get "$J" query.bool.must.bool.should)"

  words="$(
    for j in $(DB2 "$J" iterate @); do
      json_get "$(UNSTRING "$j")" match_phrase_prefix.content
    done 2>/dev/null
  )"

  # The hit list is assembled as "@" followed by tab separated "obj:..."
  # items; the tab left over after the last item is cut off again.
  _tab="$(printf '\t')"
  # $words is passed as one quoted argument: search splits it itself, and
  # user input is kept away from pathname expansion this way.
  results="@${_tab}$(
    search "${_DATA}/${_INDEX}" "$words" \
    | while read -r score id source; do
      S="$(DB2   "" set _index  str:"${_INDEX}")"
      S="$(DB2 "$S" set _id     str:"$(UNSTRING "${id#/}")")"
      S="$(DB2 "$S" set _score  num:"$score")"
      S="$(DB2 "$S" set _source obj:"$(UNSTRING "$source")")"
      printf 'obj:%s\t' "$(STRING "$S")"
    done
  )"
  results="${results%"$_tab"}"

  # max_score falls back to 0 when there is no first hit.
  sed 's;$;\r;' <<EOF
Status: 200 OK
X-elastic-product: Elasticsearch
Content-Type: application/vnd.elasticsearch+json;compatible-with=8

{ "took":0,
  "timed_out":false,
  "_shards":{"total":1,"successful":1,"skipped":0,"failed":0},
  "hits": {
    "total":{"value": $(DB2 "$results" count @) ,"relation":"eq"},
    "max_score": $(json_get "arr:$results" '[0]._score' 2>/dev/null || printf 0),
    "hits": $(json_dump "arr:$results")
  }
}
EOF

elif [ "$REQUEST_METHOD" = "HEAD" ] || [ "$REQUEST_METHOD" = "GET" ]; then
  # HEAD and GET share their header section; HEAD stops after it.
  # The Accept header is echoed back as content type only when it ends in
  # "vnd.elasticsearch+json"; everything else, including a missing Accept
  # header, is answered as application/json.
  # NOTE(review): an Accept value with parameters (";compatible-with=8")
  # therefore yields application/json - confirm that this is intended.
  accept="$(HEADER Accept)"
  case "$accept" in
    *"vnd.elasticsearch+json") ctype="${accept}" ;;
    *) ctype="application/json" ;;
  esac

  # NOTE(review): unlike the other methods this sends a full HTTP status
  # line instead of a CGI "Status:" header - confirm against the server.
  sed 's;$;\r;' <<EOF
HTTP/1.1 200 OK
X-elastic-product: Elasticsearch
content-type: ${ctype}

EOF

  [ "$REQUEST_METHOD" = "HEAD" ] && exit 0

  if [ "$PATH_INFO" = "/${_INDEX}/" ]; then
    # GET /<index>/: static index description.  Only the index name and
    # the creation date (taken from the index directory) are filled in.
    sed 's;$;\r;' <<EOF
{ $(json_dump str:"${_INDEX}"): {
    "aliases":{},
    "mappings": {
      "properties": {
        "content": {"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "hash":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "metatags":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "owner":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "parts":{"properties":{"comments":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
        "provider":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "share_names":{"properties":{"paul":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
        "source":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "title":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}
      }
    },
    "settings": {
      "index": {
        "routing":{"allocation":{"include":{"_tier_preference":"data_content"}}},
        "number_of_shards":"1",
        "provided_name": $(json_dump str:"${_INDEX}"),
        "creation_date": "$(stat -c %W "${_DATA}/${_INDEX}")",
        "number_of_replicas":"1",
        "uuid":"0000000000000000000000",
        "version":{"created":"8500010"}
      }
    }
  }
}
EOF
  else
    # Any other GET: static cluster banner.
    sed 's;$;\r;' <<EOF
{ "name" : "head",
  "cluster_name" : "elasticsearch",
  "version" : {
    "number" : "8.12.1",
    "lucene_version" : "9.9.2",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}
EOF
  fi
  exit 0

else
  # Every other request method is answered with a bare 500.
  printf '%s\r\n' "Status: 500 Internal Server Error" ""
  exit 0
fi