3 # Copyright 2023 - 2024 Paul Hänsch
5 # Permission to use, copy, modify, and/or distribute this software for any
6 # purpose with or without fee is hereby granted, provided that the above
7 # copyright notice and this permission notice appear in all copies.
9 # THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
12 # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
15 # IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 # ============================================================================
18 # Read a document from STDIN and write a word index into DIR
22 { cat; printf \\n; } \
24 BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F
25 if ( length("¡") == 1 ) # Utf-8 aware AWK
26 FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
28 FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\201-\257]')"')+";
31 { for (n = 1; n <= NF; n++) {
32 if ( $n != "" && length($n) <= 128 ) {
33 words[tolower($n)]++; total++;
35 END { for (w in words) printf "%i %i %f %s\n", words[w], total, words[w] / total, w; }
37 | while read -r num total freq word; do
38 [ "$word" ] || continue
39 printf '%i %i %f %s\n' \
40 "$num" "$total" "$freq" "$DOC_ID" \