From 898d470f90e4055d0bcfe616bc009cca8d7f5692 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Fri, 30 Aug 2024 21:59:59 +0200 Subject: [PATCH] optimized emphasis regex for performance in mawk --- markdown.awk | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/markdown.awk b/markdown.awk index 6e3febe..6e1440c 100755 --- a/markdown.awk +++ b/markdown.awk @@ -307,6 +307,7 @@ function inline( line, LOCAL, len, text, code, href, guard, ret ) { ret = ret HTML(substr( line, 1, RLENGTH)); line = substr(line, RLENGTH + 1); continue; + # strong / em matchers use pre match pattern to make processing cheaper # __strong__$ } else if ( match(line, "^__(([^_[:space:]]|" ieu ")|([^_[:space:]]|" ieu ")(" nu "|" ieu ")*([^_[:space:]]|" ieu "))__$") ) { len = RLENGTH; @@ -320,7 +321,7 @@ function inline( line, LOCAL, len, text, code, href, guard, ret ) { continue; # **strong** - } else if ( match(line, "^\\*\\*(([^\\*[:space:]]|" iea ")|([^\\*[:space:]]|" iea ")(" na "|" iea ")*([^\\*[:space:]]|" iea "))\\*\\*") ) { + } else if ( match(line, "^\\*\\*(([^*[:space:]]|" iea ")|([^*[:space:]]|" iea ")(" na "|" iea ")*([^*[:space:]]|" iea "))\\*\\*") ) { len = RLENGTH; ret = ret "" inline( substr( line, 3, len - 4 ) ) ""; line = substr( line, len + 1 ); continue; @@ -338,7 +339,7 @@ function inline( line, LOCAL, len, text, code, href, guard, ret ) { continue; # *em* - } else if ( match(line, "^\\*(([^\\*[:space:]]|" isa ")|([^\\*[:space:]]|" isa ")(" na "|" isa ")*([^\\*[:space:]]|" isa "))\\*") ) { + } else if ( match(line, "^\\*(([^*[:space:]]|" isa ")|([^*[:space:]]|" isa ")(" na "|" isa ")*([^*[:space:]]|" isa "))\\*") ) { len = RLENGTH; ret = ret "" inline( substr( line, 2, len - 2 ) ) ""; line = substr( line, len + 1 ); continue; @@ -944,12 +945,12 @@ BEGIN { # hls = "0 0 0 0 0 0"; # Universal Patterns - nu = "(\\\\\\\\|\\\\[^\\\\]|[^\\\\_]|_[[:alnum:]])*" # not underline (except when escaped) - na = "(\\\\\\\\|\\\\[^\\\\]|[^\\\\\\*])*" # not asterisk (except when escaped) - ieu = "_([^_[:space:]]|[^_[:space:]]" nu "[^_[:space:]])_" # inner (underline) - isu = "__([^_[:space:]]|[^_[:space:]]" nu "[^_[:space:]])__" # inner (underline) - iea = "\\*([^\\*[:space:]]|[^\\*[:space:]]" na "[^\\*[:space:]])\\*" # inner (asterisk) - isa = "\\*\\*([^\\*[:space:]]|[^\\*[:space:]]" na "[^\\*[:space:]])\\*\\*" # inner (asterisk) + nu = "([^_\\\\]|\\\\.|_[[:alnum:]])" # not underline (except when escaped, or inside a word) + na = "([^*\\\\]|\\\\.)" # not asterisk (except when escaped) + ieu = "_([^_[:space:]]|[^_[:space:]]" nu "*[^_[:space:]])_" # inner (underline) + isu = "__([^_[:space:]]|[^_[:space:]]" nu "*[^_[:space:]])__" # inner (underline) + iea = "\\*([^*[:space:]]|[^*[:space:]]" na "*[^*[:space:]])\\*" # inner (asterisk) + isa = "\\*\\*([^*[:space:]]|[^*[:space:]]" na "*[^*[:space:]])\\*\\*" # inner (asterisk) lix="\\[(\\\\[^\n]|[^]\n\\\\[])*\\]" # link text lid="(<(\\\\[^\n]|[^\n<>\\\\])*>|(\\\\.|[^()\"'\\\\])+|([^<\n\t ()\\\\]|\\\\[^\n])(\\\\[\n]|[^\n\t \\(\\)\\\\])*)" # link dest -- 2.39.2