From 898d470f90e4055d0bcfe616bc009cca8d7f5692 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Paul=20H=C3=A4nsch?= <paul@plutz.net>
Date: Fri, 30 Aug 2024 21:59:59 +0200
Subject: [PATCH] optimized emphasis regex for performance in mawk

---
 markdown.awk | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/markdown.awk b/markdown.awk
index 6e3febe..6e1440c 100755
--- a/markdown.awk
+++ b/markdown.awk
@@ -307,6 +307,7 @@ function inline( line, LOCAL, len, text, code, href, guard, ret ) {
       ret = ret HTML(substr( line, 1, RLENGTH)); line = substr(line, RLENGTH + 1);
       continue;
 
+    # strong / em matchers use a pre-match pattern to make processing cheaper
     #  __strong__$
     } else if ( match(line, "^__(([^_[:space:]]|" ieu ")|([^_[:space:]]|" ieu ")(" nu "|" ieu ")*([^_[:space:]]|" ieu "))__$") ) {
       len = RLENGTH;
@@ -320,7 +321,7 @@ function inline( line, LOCAL, len, text, code, href, guard, ret ) {
       continue;
 
     #  **strong**
-    } else if ( match(line, "^\\*\\*(([^\\*[:space:]]|" iea ")|([^\\*[:space:]]|" iea ")(" na "|" iea ")*([^\\*[:space:]]|" iea "))\\*\\*") ) {
+    } else if ( match(line, "^\\*\\*(([^*[:space:]]|" iea ")|([^*[:space:]]|" iea ")(" na "|" iea ")*([^*[:space:]]|" iea "))\\*\\*") ) {
       len = RLENGTH;
       ret = ret "<strong>" inline( substr( line, 3, len - 4 ) ) "</strong>"; line = substr( line, len + 1 );
       continue;
@@ -338,7 +339,7 @@ function inline( line, LOCAL, len, text, code, href, guard, ret ) {
       continue;
 
     #  *em*
-    } else if ( match(line, "^\\*(([^\\*[:space:]]|" isa ")|([^\\*[:space:]]|" isa ")(" na "|" isa ")*([^\\*[:space:]]|" isa "))\\*") ) {
+    } else if ( match(line, "^\\*(([^*[:space:]]|" isa ")|([^*[:space:]]|" isa ")(" na "|" isa ")*([^*[:space:]]|" isa "))\\*") ) {
       len = RLENGTH;
       ret = ret "<em>" inline( substr( line, 2, len - 2 ) ) "</em>"; line = substr( line, len + 1 );
       continue;
@@ -944,12 +945,12 @@ BEGIN {
   # hls = "0 0 0 0 0 0";
 
   # Universal Patterns
-  nu = "(\\\\\\\\|\\\\[^\\\\]|[^\\\\_]|_[[:alnum:]])*"    # not underline (except when escaped)
-  na = "(\\\\\\\\|\\\\[^\\\\]|[^\\\\\\*])*"  # not asterisk (except when escaped)
-  ieu =  "_([^_[:space:]]|[^_[:space:]]" nu "[^_[:space:]])_"                 # inner <em> (underline)
-  isu = "__([^_[:space:]]|[^_[:space:]]" nu "[^_[:space:]])__"                # inner <strong> (underline)
-  iea =    "\\*([^\\*[:space:]]|[^\\*[:space:]]" na "[^\\*[:space:]])\\*"     # inner <em> (asterisk)
-  isa = "\\*\\*([^\\*[:space:]]|[^\\*[:space:]]" na "[^\\*[:space:]])\\*\\*"  # inner <strong> (asterisk)
+  nu = "([^_\\\\]|\\\\.|_[[:alnum:]])"  # not underline (except when escaped, or inside a word)
+  na = "([^*\\\\]|\\\\.)"               # not asterisk (except when escaped)
+  ieu =  "_([^_[:space:]]|[^_[:space:]]" nu "*[^_[:space:]])_"                 # inner <em> (underline)
+  isu = "__([^_[:space:]]|[^_[:space:]]" nu "*[^_[:space:]])__"                # inner <strong> (underline)
+  iea =    "\\*([^*[:space:]]|[^*[:space:]]" na "*[^*[:space:]])\\*"     # inner <em> (asterisk)
+  isa = "\\*\\*([^*[:space:]]|[^*[:space:]]" na "*[^*[:space:]])\\*\\*"  # inner <strong> (asterisk)
 
   lix="\\[(\\\\[^\n]|[^]\n\\\\[])*\\]"  # link text
   lid="(<(\\\\[^\n]|[^\n<>\\\\])*>|(\\\\.|[^()\"'\\\\])+|([^<\n\t ()\\\\]|\\\\[^\n])(\\\\[\n]|[^\n\t \\(\\)\\\\])*)"  # link dest
-- 
2.39.5