X-Git-Url: https://git.plutz.net/?a=blobdiff_plain;f=parsers%2F40_indexer.sh;fp=parsers%2F40_indexer.sh;h=08060f1637ba6aa4b83a1d686fab601f09751ffd;hb=634d3aa5daf5404f2d209512acf70c601ece5697;hp=e3065c14aa11b8c5c82b357188f008ce48223e3a;hpb=22724fcc85fc43d40da86fedd6f11a7b32289f46;p=shellwiki diff --git a/parsers/40_indexer.sh b/parsers/40_indexer.sh index e3065c1..08060f1 100755 --- a/parsers/40_indexer.sh +++ b/parsers/40_indexer.sh @@ -36,9 +36,9 @@ done \ | awk ' BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F if ( length("¡") == 1 ) # Utf-8 aware AWK - FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+"; + FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+"; else # UTF-8 Hack - FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\201-\257]')"')+"; + FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\201-\257]')"')+"; fi } { for (n = 1; n <= NF; n++) { @@ -47,7 +47,7 @@ done \ } } } END { for (w in words) printf "%i %i %f %s\n", words[w], total, words[w] / total, w; } ' \ -| while read num total freq word; do +| while read -r num total freq word; do [ "$word" ] || continue F="$I/$word" L="$(STRING "$DOC")"