X-Git-Url: https://git.plutz.net/?a=blobdiff_plain;f=parsers%2F40_indexer.sh;h=e3065c14aa11b8c5c82b357188f008ce48223e3a;hb=d9a3dee90a669c88bd6b4b0fa553e01ed1e729a1;hp=3cfc72c36d4ee61107100b433007bd252d4a9317;hpb=0054bedc4d290102bb839f81da9dc3a44e3bf679;p=shellwiki diff --git a/parsers/40_indexer.sh b/parsers/40_indexer.sh index 3cfc72c..e3065c1 100755 --- a/parsers/40_indexer.sh +++ b/parsers/40_indexer.sh @@ -25,7 +25,7 @@ fi exec 3>&1 -touch "$P/#index.flag" +touch -d "@$_DATE" "$P/#index.flag" mkdir -p "$I" { cat; printf \\n; } \ @@ -34,7 +34,13 @@ mkdir -p "$I" printf '%s\n' "$line" done \ | awk ' - BEGIN { FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2})+" } + BEGIN { # Field separator FS should include punctuation, including Unicode Block U+2000 - U+206F + if ( length("¡") == 1 ) # Utf-8 aware AWK + FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '[\342\200\200-\342\201\257]')"')+"; + else # UTF-8 Hack + FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2}|'"$(printf '\342\200[\200-\277]|\342\201[\201-\257]')"')+"; + fi + } { for (n = 1; n <= NF; n++) { if ( $n != "" && length($n) <= 128 ) { words[tolower($n)]++; total++;