printf '%s\n' "$line"
done \
| awk '
- BEGIN { FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2})+" }
+ BEGIN { # FS splits on ASCII punctuation/whitespace, %XX escapes and Unicode General Punctuation (U+2000 - U+206F)
+ # Probe: U+00A1 emitted as raw UTF-8 bytes (C2 A1); length 1 means this awk counts characters, 2 means it counts bytes
+ if ( length("'"$(printf '\302\241')"'") == 1 ) # UTF-8 aware awk: a plain character range works
+ FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+";
+ else # byte-oriented awk: match the UTF-8 byte sequences E2 80 80-BF and E2 81 80-AF instead
+ FS = "([] \\t\\n\\r!\"#'\''()*+,./:;<=>?\\\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\200-\257]')"')+";
+ }
{ for (n = 1; n <= NF; n++) {
if ( $n != "" && length($n) <= 128 ) {
words[tolower($n)]++; total++;
} } }
END { for (w in words) printf "%i %i %f %s\n", words[w], total, words[w] / total, w; }
' \
-| while read num total freq word; do
+| while read -r num total freq word; do # fields as printed by the awk END rule: count, total, count/total, word
[ "$word" ] || continue
- F="$I/$word"
- L="$(STRING "$DOC")"
-
- if LOCK "$F"; then
- touch "$F"
- { while read d l f n t; do
- [ "$l" = "$L" ] \
- || printf "%i %s %f %i %i\n" \
- "$d" "$l" "$f" "$n" "$t"
- done <"$F"
- printf "%i %s %f %i %i\n" \
- "$_DATE" "$L" "$freq" "$num" "$total"
- } >"$F.$$"
- mv -- "$F.$$" "$F"
- RELEASE "$F"
- fi
+ printf "%i %s %f %i %i\n" \
+ "$_DATE" "$(STRING "$DOC")" \
+ "$freq" "$num" "$total" \
+ >>"$I/$word" # append one record to the per-word file; nothing is locked or rewritten here
done