#!/bin/sh
# index.cgi - HTTP request handler of rigidfind.
#
# Origin: git.plutz.net Git - rigidfind/blob - index.cgi
#         (blob 61f811e2a5b2b24623e6a7c34f87a7806e0c1820)
#
# Dispatches on $REQUEST_METHOD (PUT, DELETE, POST, HEAD, GET) and answers
# with Elasticsearch-style headers and JSON bodies.
#
# Environment read by this file:
#   _EXEC           directory containing cgilite/ and concordance.sh
#   _DATA           base directory holding one sub directory per index
#   _DATE           optional preset timestamp (epoch seconds)
#   PATH_INFO, REQUEST_METHOD, CONTENT_LENGTH   request description

# Helper libraries are looked up relative to $_EXEC; when _EXEC is unset or
# empty the directory part of $0 is used instead.
. "${_EXEC:-${0%/*}}/cgilite/cgilite.sh"
. "${_EXEC:-${0%/*}}/cgilite/storage.sh"
. "${_EXEC:-${0%/*}}/cgilite/json.sh"

# Keep a timestamp that is already set; otherwise use the current epoch
# time. The value is later stored as "_indexdate" of ingested documents.
[ "$_DATE" ] || _DATE="$(date +%s)"

# Decode a base64 string ($1) to stdout.
_ingest_decode() {
  printf %s "$1" |base64 -d
}

# Decode a base64 encoded zip archive ($1), print archive member $2 and
# replace every <...> tag by $3. Output of unzip is capped at 128M.
_ingest_zip_xml() {
  _ingest_decode "$1" \
  | unzip -qc /dev/stdin "$2" \
  | head -c 128M | sed "s;<[^>]*>;$3;g"
}

#######################################
# Print the plain text of a document to stdout.
# The converter is selected by the file name suffix found in the "title"
# field of the document metadata; unknown suffixes produce no output.
# Globals:
#   content  (read) fallback for $2 when the argument is omitted
# Arguments:
#   $1 - document metadata as accepted by json_get
#   $2 - base64 encoded document content (optional, see Globals)
# Outputs:
#   extracted text on stdout
# Returns:
#   exit status of the last command of the selected conversion
#######################################
ingest() {
  # The caller passes the content as second argument; earlier revisions
  # of this function ignored it and only worked through the global.
  local J="$1" data="${2-$content}" title

  # json_get "$J" title
  # json_get "$J" parts.comments

  title="$(json_get "$J" title)"

  case $title in
    *.md|*.txt|*.csv)
      _ingest_decode "$data"
      ;;
    *.pdf)
      _ingest_decode "$data" \
      | pdftotext - -
      ;;
    *.doc)
      _ingest_decode "$data" \
      | catdoc /dev/stdin
      ;;
    *.xls)
      _ingest_decode "$data" \
      | xls2csv /dev/stdin
      ;;
    *.ppt)
      _ingest_decode "$data" \
      | catppt /dev/stdin
      ;;
    *.html|*.xml|*.svg)
      # Markup is reduced to text by deleting everything between < and >
      _ingest_decode "$data" \
      | sed 's;<[^>]*>;;g'
      ;;
    *.docx)
      _ingest_zip_xml "$data" word/document.xml ''
      ;;
    *.xlsx)
      # Tags become blanks here so neighbouring strings stay separated
      _ingest_zip_xml "$data" xl/sharedStrings.xml ' '
      ;;
    *.odt)
      _ingest_zip_xml "$data" content.xml ''
      ;;
    *.ods|*.odp)
      _ingest_zip_xml "$data" content.xml ' '
      ;;
    *):;;
  esac
}

#######################################
# Look up query words in an index directory and print matching documents.
# Globals:
#   _records  (read) document store handed to "DBM ... get"
# Arguments:
#   $1  - index directory containing one file per indexed word
#   $2… - query text; it is split into lower-cased words at punctuation,
#         percent escapes (%XX) and characters U+2000 - U+206F
# Outputs:
#   one line per hit: weight <TAB> STRING(doc id) <TAB> STRING(record),
#   ordered by descending weight
# Notes:
#   * Only documents that appear in the largest number of word lists are
#     reported; their weight is the sum of the per-word frequencies.
#   * A hit is dropped when the date stored in the word list differs from
#     the "_indexdate" of the stored record.
#######################################
search() {
  local index="$1" words w num total freq doc date J
  shift 1; words="$*"

  # The multi-byte literals are produced by printf from octal escapes so
  # that this file stays plain ASCII and cannot be damaged by re-encoding.
  words="$(printf %s\\n "$words" | awk '
    BEGIN { # Field separator: punctuation, percent escapes and Unicode block U+2000 - U+206F
            if ( length("'"$(printf '\302\241')"'") == 1 )  # UTF-8 aware AWK
              FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
            else                     # byte-wise AWK: spell the range out as byte sequences
              FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
          }
      { for (n = 1; n <= NF; n++) printf "%s  ", tolower($n); }
  ')"

  # $words is split on purpose; the awk separator already removed glob
  # characters (* ? [ ]) as well as "/" and "." from the individual words.
  for w in ${words}; do
    [ -f "${index}/$w" ] || continue

    # read -r: keep backslashes of the encoded document id intact
    while read -r num total freq doc date; do
      printf '%s-%i  %f\n' "${doc}" "${date}" "$freq"
    done <"${index}/$w"
  done \
  | awk '
        { cnt[$1]++; weight[$1] += $2; }
    END { m = 0; for (d in cnt) m = ( m < cnt[d] ) ? cnt[d] : m;
          for (d in cnt) if ( cnt[d] == m ) printf "%f    %s\n", weight[d], d;
        }
  ' \
  | sort -nr \
  | while read -r freq doc; do
    # Key format is "<encoded id>-<date>"; the date follows the last dash
    date="${doc##*-}" doc="$(UNSTRING "${doc%-*}")"

    if J="$(DBM "$_records" get "$doc")"; then
      [ "$date" -eq "$(json_get obj:"$J" _indexdate)" ] \
      && printf '%f\t%s\t%s\n' \
         "$freq" "$(STRING "$doc")" "$(STRING "$J")"
    fi
  done
}

# The first path component of PATH_INFO names the index; everything from
# the next "/" onwards is cut off.
# NOTE(review): the value is taken from the request without validation
# (e.g. ".." is not rejected) - confirm that the front end normalises paths.
_INDEX="${PATH_INFO#/}"
_INDEX="${_INDEX%%/*}"

# Document store of the selected index, used with the DBM helper.
_records="${_DATA}/${_INDEX}/_0_DOCS"

# ---------------------------------------------------------------------------
# Request dispatcher and its response helpers.
#
# All here-documents below start in column 0 and field separators are taken
# from $_TAB, so the script does not depend on literal tab characters
# surviving editors or copy and paste.
# Every response line is terminated with CR LF by the sed filter.
# ---------------------------------------------------------------------------

_TAB="$(printf '\t')"

# Print the Content-Type for HEAD/GET replies: the client's Accept value
# ($1) when it mentions the Elasticsearch vendor type, else application/json.
_es_ctype() {
  case "$1" in
    *vnd.elasticsearch+json*) printf '%s' "$1" ;;
    *) printf '%s' "application/json" ;;
  esac
}

# Print the remaining headers and the JSON body of a PUT/DELETE reply.
# $1 - value of the "result" member; reads _INDEX, _doc and _DATE.
_es_doc_result() {
  sed 's;$;\r;' <<EOF
X-elastic-product: Elasticsearch
content-type: application/vnd.elasticsearch+json;compatible-with=8

{ "_index": $(json_dump str:"${_INDEX}"),
  "_id": $(json_dump str:"$_doc"),
  "result": "$1",
  "_indexdate": $_DATE
}
EOF
}

# Print the complete reply of a search request.
# $1 - result list in the array notation used by DB2/json_dump
_es_search_response() {
  sed 's;$;\r;' <<EOF
Status: 200 OK
X-elastic-product: Elasticsearch
Content-Type: application/vnd.elasticsearch+json;compatible-with=8

{ "took":0,
  "timed_out":false,
  "_shards":{"total":1,"successful":1,"skipped":0,"failed":0},
  "hits": {
    "total":{"value": $(DB2 "$1" count @) ,"relation":"eq"},
    "max_score": $(json_get "arr:$1" '[0]._score' 2>/dev/null || printf 0),
    "hits": $(json_dump "arr:$1")
  }
}
EOF
}

# Print status line and headers shared by HEAD and GET replies.
# $1 - content type to announce
# NOTE(review): this emits an "HTTP/1.1" status line while the other
# handlers emit "Status:" headers - kept as found, confirm it is intended.
_es_probe_headers() {
  sed 's;$;\r;' <<EOF
HTTP/1.1 200 OK
X-elastic-product: Elasticsearch
content-type: $1

EOF
}

# Print the description (aliases, mappings, settings) of index $_INDEX.
_es_index_info() {
  sed 's;$;\r;' <<EOF
{ $(json_dump str:"${_INDEX}"): {
    "aliases":{},
    "mappings": {
      "properties": {
        "content": {"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "hash":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "metatags":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "owner":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "parts":{"properties":{"comments":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
        "provider":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "share_names":{"properties":{"paul":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}},
        "source":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}},
        "title":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}
      }
    },
    "settings": {
      "index": {
        "routing":{"allocation":{"include":{"_tier_preference":"data_content"}}},
        "number_of_shards":"1",
        "provided_name": $(json_dump str:"${_INDEX}"),
        "creation_date": "$(stat -c %W "${_DATA}/${_INDEX}")",
        "number_of_replicas":"1",
        "uuid":"0000000000000000000000",
        "version":{"created":"8500010"}
      }
    }
  }
}
EOF
}

# Print the static server description returned for every other GET path.
_es_server_info() {
  sed 's;$;\r;' <<EOF
{ "name" : "head",
  "cluster_name" : "elasticsearch",
  "version" : {
    "number" : "8.12.1",
    "lucene_version" : "9.9.2",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}
EOF
}

if   [ "$REQUEST_METHOD" = "PUT" ]; then
  # --- store and index one document ---------------------------------------
  _doc="${PATH_INFO#"/${_INDEX}/_doc"}"

  J="$(head -c "${CONTENT_LENGTH:-0}")"
  # Don't use json parser to get content field
  # Content can be very large and the json parser is slow
  content="$(printf %s\\n "$J" |sed -E '
    :X; $bY; N; bX; :Y;
    s;^.*,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*";;
    s;".*$;;
    s;\\;;g;
  ')"
  # Remove the content member before the remainder is parsed
  J="$(printf %s\\n "$J" |sed -E '
    :X; $bY; N; bX; :Y;
    s;,[ \t\r\n]*"content"[ \t\r\n]*:[ \t\r\n]*"[^"]*";;
  ')"
  J="$(json_load "${J}")"

  # Second argument of concordance.sh: encoded document id, a tab, the date
  ingest "$J" "$content" \
  | "${_EXEC}/concordance.sh" \
    "$_DATA/$_INDEX/" "$(STRING "$_doc")${_TAB}$_DATE"

  J="${J#obj:}"
  J="$(DB2 "$J" set _indexdate num:"$_DATE")"

  if   DBM "$_records" insert "$_doc" "$J"; then
    printf '%s: %s\r\n' "Status" "201 Created" "Location" "/${_INDEX}/_doc/$(URL "$_doc")"
    # Must be a statement of its own: with a line continuation after the
    # printf it was passed to printf as an argument and never assigned.
    result="created"
  elif DBM "$_records" update "$_doc" "$J"; then
    printf '%s: %s\r\n' "Status" "200 OK"
    result="updated"
  else
    printf '%s\r\n' "Status: 500 Internal Server Error" ""
    exit 0
  fi

  _es_doc_result "$result"
  exit 0

elif [ "$REQUEST_METHOD" = "DELETE" ]; then
  # --- remove one document from the store ---------------------------------
  _doc="${PATH_INFO#"/${_INDEX}/_doc"}"

  # Only the exit status is of interest; the record printed by "get" must
  # not end up in front of the response headers.
  if   DBM "$_records" get "$_doc" >/dev/null; then
    if   DBM "$_records" delete "$_doc"; then
      printf '%s: %s\r\n' "Status" "200 OK"
      result="deleted"
    else
      printf '%s\r\n' "Status: 500 Internal Server Error" ""
      exit 0
    fi
  else
    printf '%s: %s\r\n' "Status" "404 Not Found"
    result="not_found"
  fi

  _es_doc_result "$result"
  exit 0

elif [ "$REQUEST_METHOD" = "POST" ]; then
  # --- search --------------------------------------------------------------
  J="$(json_load "$(head -c "${CONTENT_LENGTH:-0}")")"
  J="$(json_get "$J" query.bool.must.bool.should)"

  # Collect the phrase of every "should" clause; clauses without a
  # match_phrase_prefix.content member are skipped silently.
  words="$(
    for j in $(DB2 "$J" iterate @); do
      json_get "$(UNSTRING "$j")" match_phrase_prefix.content
    done 2>/dev/null
  )"

  # Build the result list: "@", then one tab separated "obj:" entry per hit.
  # $words is quoted so that glob characters typed by the user are not
  # expanded against the working directory; search() tokenises it itself.
  results="@${_TAB}$(
    search "${_DATA}/${_INDEX}" "$words" \
    | while read -r score id source; do
      S="$(DB2   "" set _id     str:"$(UNSTRING "${id#/}")")"
      S="$(DB2 "$S" set _score  num:"$score")"
      S="$(DB2 "$S" set _source obj:"$(UNSTRING "$source")")"
      printf 'obj:%s\t' "$(STRING "$S")"
    done
  )"
  results="${results%"${_TAB}"}"

  _es_search_response "$results"

elif [ "$REQUEST_METHOD" = "HEAD" ]; then
  # --- availability probe --------------------------------------------------
  ctype="$(_es_ctype "$(HEADER Accept)")"

  _es_probe_headers "$ctype"
  exit 0

elif [ "$REQUEST_METHOD" = "GET" ]; then
  # --- index or server information -----------------------------------------
  ctype="$(_es_ctype "$(HEADER Accept)")"

  _es_probe_headers "$ctype"

  if [ "$PATH_INFO" = "/${_INDEX}/" ]; then
    _es_index_info
  else
    _es_server_info
  fi
  exit 0

else
  # --- any other request method --------------------------------------------
  printf '%s\r\n' "Status: 500 Internal Server Error" ""
  exit 0
fi