X-Git-Url: https://git.plutz.net/?a=blobdiff_plain;f=parsers%2F40_indexer.sh;h=ec7be4a7f4a6d9fa17da3b19ed762c814ce9b78b;hb=0befccc1274549229b77d50c1b8b24d6c4cf5403;hp=d5783f4ea9c8e79cb78dbf706ab75888ec0fd78f;hpb=feca7d3f5bff6cd8b1437adb95686a77ab8e9367;p=shellwiki diff --git a/parsers/40_indexer.sh b/parsers/40_indexer.sh index d5783f4..ec7be4a 100755 --- a/parsers/40_indexer.sh +++ b/parsers/40_indexer.sh @@ -14,10 +14,9 @@ # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR # IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -DOC="${PATH_INFO%/}/" DOC="${DOC%/\[*\]/}" DOC="${DOC%/}/" -P="$_DATA/pages${DOC}" I="$_DATA/index/" +DOC="${PATH_INFO%/}/" P="$_DATA/pages${DOC}" I="$_DATA/index/" -if [ -f "$P/#index.flag" -a ! "$P/#page.md" -nt "$P/#index.flag" ]; then +if [ -f "$P/#index.flag" -a ! "$P/#page.md" -nt "$P/#index.flag" ] || [ ! -d "$P" ]; then cat exit 0 fi @@ -26,7 +25,7 @@ fi exec 3>&1 -touch "$P/#index.flag" +touch -d "@$_DATE" "$P/#index.flag" mkdir -p "$I" { cat; printf \\n; } \ @@ -35,8 +34,11 @@ mkdir -p "$I" printf '%s\n' "$line" done \ | awk ' - BEGIN { FS = "[] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]+" } - { for (n = 1; n <= NF; n++) if ( $n != "" ) { words[tolower($n)]++; total++; } } + BEGIN { FS = "([] \t\n\r!\"#'\''()*+,./:;<=>?\\^_`{|}~[-]|%[0-9A-Fa-f]{2})+" } + { for (n = 1; n <= NF; n++) { + if ( $n != "" && length($n) <= 128 ) { + words[tolower($n)]++; total++; + } } } END { for (w in words) printf "%i %i %f %s\n", words[w], total, words[w] / total, w; } ' \ | while read num total freq word; do