From 61d13e28d97b124239b530b1e336395d90953793 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Thu, 29 Aug 2024 21:14:40 +0200 Subject: [PATCH 01/16] tests for references and macros --- tests-markdown.sh | 54 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/tests-markdown.sh b/tests-markdown.sh index 01fbda1..0ffc3d3 100755 --- a/tests-markdown.sh +++ b/tests-markdown.sh @@ -77,6 +77,8 @@ assert '[![Wikipedia](wikilogo.png)]()'\ '

Wikipedia

'\ "Image Link" +assert ' <" _foo_>>' '

macro /test -- "* weird <args>" _foo_

' "Macros" + # Block checks printf '\n## Testing Block markup ##\n' @@ -162,7 +164,7 @@ assert '![Testbild](Test Bild.jpg)' \ assert '![Testbild](Test Bild.jpg "German Television *test* image ca. 1994")' \ '
Testbild
German Television test image ca. 1994
' \ -"inline image" +"block image" assert '![Testbild *ARD*](Test Bild.jpg){tv ard function-check}' \ '
Testbild *ARD*
' \ @@ -306,7 +308,7 @@ assert ' # 'Pipe Tables' assert '+---+---+---+ -|Col 1| Col 2 | Col 3| +|Col 1\\| Col\|2 | Col 3| +===+:==:+===+ | * foo1 | *bar* |```| | * foo2 | **qua** |code | @@ -316,8 +318,8 @@ assert '+---+---+---+ +-------+-----+----+ ' \ ' - @@ -359,4 +361,46 @@ sub bar ' \ 'Headline Nesting' -printf '\nAll test passed!\n' +# Reference syntax checks +printf '\n## Testing reference syntax ##\n' + +assert 'Foo bar [Link] [1] for show + +The same in [en][] + +[en]: +[1]: http://de.wikipedia.org "Online Encyclopedia"' \ +'

Foo bar Link for show

+ +

The same in en

' \ +"Reference Links" + +assert 'Foo bar [Link] [1] for show + +[en]: +[1]: http://de.wikipedia.org + "Online Encyclopedia"' \ +'

Foo bar Link for show

' \ +"Reference Links" + +assert 'Foo bar ![Image] [1] for show + +The same as ![PNG][] + +[PNG]: +[1]: http://de.wikipedia.org/logo.jpg "Online Encyclopedia"' \ +'

Foo bar Image for show

+ +

The same as PNG

' \ +"Reference images" + +assert '![Image] [1] + +[PNG]: +[1]: http://de.wikipedia.org/logo.jpg "Online Encyclopedia"' \ +'
Image
Online Encyclopedia
' \ +"Reference images (block)" + +assert '<" _foo_>>' 'macro /test -- "* weird <args>" _foo_' "Macros/Block" + +printf '\nAll tests passed!\n' -- 2.39.5 From 7ba97e6646261c0a15e2f46093141c3c7a164775 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Fri, 30 Aug 2024 00:52:53 +0200 Subject: [PATCH 02/16] include mawk tests --- tests-markdown.sh | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tests-markdown.sh b/tests-markdown.sh index 0ffc3d3..f67cfbf 100755 --- a/tests-markdown.sh +++ b/tests-markdown.sh @@ -15,7 +15,7 @@ assert() { local md comp="$2" msg="$3" printf "%3i: %s ... " $acnt "$msg" - for proc in gawk bawk; do + for proc in gawk bawk mawk; do printf '%s ' $proc md="$(printf '%s' "$1" |md_"$proc")" if [ "$md" != "$comp" ]; then @@ -52,9 +52,9 @@ assert '`_foo_-> bar`' '

_foo_-> bar

' 'arrow' assert ' <- comment' '

<!-- comment --> ← comment

' 'arrow' # Escaping -assert "©" "

©

" "escape" -assert "\©" "

&copy;

" "escape" -assert "AT&T" "

AT&T

" "escape" +assert '©' "

©

" "escape" +assert '\©' "

&copy;

" "escape" +assert 'AT&T' "

AT&T

" "escape" assert '`©`' "

&copy;

" "code span escape" # Automatic Links @@ -131,6 +131,18 @@ not be but &shy; <escaped>' \ "indented code block" +assert ' indented code will + not be + + *formatted* + but ­ ' \ +'
indented code will
+not be
+
+*formatted*
+but &shy; <escaped>
' \ +"indented code block" + assert ':::: tag fenced _divs_ are regular text @@ -204,7 +216,7 @@ assert '#### Heading four' \ ' \ 'Heading arbitrary' -assert '### Heading three ######' \ +assert '###Heading three ######' \ '

Heading three

' \ 'Heading arbitrary' @@ -351,7 +363,7 @@ sub bar ### sub sub sub ### -## sub2 bar {x} +##sub2 bar {x} ' \ '

foo

bar

-- 2.39.5 From 1653669a6c2a769cc541893182436fa3a603e9c4 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Fri, 30 Aug 2024 00:54:16 +0200 Subject: [PATCH 03/16] compatibility changes: no gensub function --- markdown.awk | 166 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 100 insertions(+), 66 deletions(-) diff --git a/markdown.awk b/markdown.awk index 7e20ebb..fcd42cf 100755 --- a/markdown.awk +++ b/markdown.awk @@ -154,9 +154,9 @@ function inline( line, LOCAL, len, text, code, href, guard ) { # Wiki style links } else if ( match( line, /^\[\[([^]|]+)(\|[^]]+)?\]\]/) ) { - len = RLENGTH; - href = gensub(/^\[\[([^]|]+)(\|([^]]+))?\]\]/, "\\1", 1, substr(line, 1, len) ); - text = gensub(/^\[\[([^]|]+)(\|([^]]+))?\]\]/, "\\3", 1, substr(line, 1, len) ); + len = RLENGTH; href = text = substr(line, 1, len); + sub(/^\[\[([^]|]+)(\|([^]]+))?\]\]/, "\\1", href ); + sub(/^\[\[([^]|]+)(\|([^]]+))?\]\]/, "\\3", text ); if ( ! text ) text = href; return "" HTML(text) "" inline( substr( line, len + 1) ); @@ -197,9 +197,11 @@ function inline( line, LOCAL, len, text, code, href, guard ) { # reference style links } else if ( match(line, /^\[([^]]+)\] ?\[([^]]*)\]/ ) ) { - len = RLENGTH; - text = gensub(/^\[([^\n]+)\] ?\[([^\n]*)\].*/, "\\1", 1, substr(line, 1, len) ); - id = gensub(/^\[([^\n]+)\] ?\[([^\n]*)\].*/, "\\2", 1, substr(line, 1, len) ); + len = RLENGTH; text = id = substr(line, 1, len); + sub(/\n.*$/, "", text); sub(/^\[/, "", text); sub(/\] ?\[([^\n]*)\].*$/, "", text); + sub(/\n.*$/, "", id); sub(/^\[([^]]+)\] ?\[/, "", id); sub(/\].*$/, "", id); + # text = gensub(/^\[([^\n]+)\] ?\[([^\n]*)\].*/, "\\1", 1, text ); + # id = gensub(/^\[([^\n]+)\] ?\[([^\n]*)\].*/, "\\2", 1, id ); if ( ! id ) id = text; if ( rl_href[id] && rl_title[id] ) { return "" inline(text) "" inline( substr( line, len + 1) ); @@ -240,9 +242,11 @@ function inline( line, LOCAL, len, text, code, href, guard ) { # reference style images } else if ( match(line, /^!\[([^]]*)\] ?\[([^]]*)\]/ ) ) { - len = RLENGTH; - text = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\].*/, "\\1", 1, substr(line, 1, len) ); - id = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\].*/, "\\2", 1, substr(line, 1, len) ); + len = RLENGTH; text = id = substr(line, 1, len); + sub(/\n.*$/, "", text); sub(/^!\[/, "", text); sub(/\] ?\[([^\n]*)\].*$/, "", text); + sub(/\n.*$/, "", id); sub(/^!\[([^]]+)\] ?\[/, "", id); sub(/\].*$/, "", id); + # text = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\].*/, "\\1", 1, substr(line, 1, len) ); + # id = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\].*/, "\\2", 1, substr(line, 1, len) ); if ( ! id ) id = text; if ( rl_href[id] && rl_title[id] ) { return "\""" \ @@ -304,7 +308,9 @@ function inline( line, LOCAL, len, text, code, href, guard ) { return "" inline( substr( line, 2, len - 2 ) ) "" inline( substr( line, len + 1 ) ); # Literal HTML entities - } else if ( match( line, /^&([a-zA-Z]{2,32}|#[0-9]{1,7}|#[xX][0-9a-fA-F]{1,6});/) ) { + # } else if ( match( line, /^&([a-zA-Z]{2,32}|#[0-9]{1,7}|#[xX][0-9a-fA-F]{1,6});/) ) { + # mawk does not support repitition ranges + } else if ( match( line, /^&([a-zA-Z][a-zA-Z][a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?|#[0-9][0-9]?[0-9]?[0-9]?[0-9]?[0-9]?[0-9]?|#[xX][0-9a-fA-F][0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?);/) ) { len = RLENGTH; return substr( line, 1, len ) inline(substr(line, len + 1)); @@ -327,7 +333,8 @@ function inline( line, LOCAL, len, text, code, href, guard ) { } function headline( hlvl, htxt, attrib, LOCAL, sec, n, HL) { - match(hstack, /([0-9]+( [0-9]+){5})$/); split( substr(hstack, RSTART), HL); + # match(hstack, /([0-9]+( [0-9]+){5})$/); split( substr(hstack, RSTART), HL); + match(hstack, /([0-9]+( [0-9]+)( [0-9]+)( [0-9]+)( [0-9]+)( [0-9]+))$/); split( substr(hstack, RSTART), HL); for ( n = hlvl; n <= 6; n++ ) { sec = sec (HL[n]?"
":""); } HL[hlvl]++; for ( n = hlvl + 1; n <= 6; n++) { HL[n] = 0;} @@ -336,7 +343,8 @@ function headline( hlvl, htxt, attrib, LOCAL, sec, n, HL) { hid = hid HL[1]; for ( n = 2; n <= hlvl; n++) { hid = hid "." HL[n] ; } hid = hid ":" URL(htxt, 1); - sub(/([0-9]+( [0-9]+){5})$/, "", hstack); + # sub(/([0-9]+( [0-9]+){5})$/, "", hstack); + sub(/([0-9]+( [0-9]+)( [0-9]+)( [0-9]+)( [0-9]+)( [0-9]+))$/, "", hstack); hstack = hstack HL[1] " " HL[2] " " HL[3] " " HL[4] " " HL[5] " " HL[6]; return sec "
" \ @@ -413,15 +421,14 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, cols = 0; cnt=0; ttext = ""; # table header and alignment - split( gensub( /(^\||\|$)/, "", "g", \ - gensub( /(^|[^\\])\\\|/, "\\1\\|", "g", \ - substr(block, 1, match(block, /(\n|$)/)) \ - )), tarray, /\|/); + tmp = substr(block, 1, match(block, /(\n|$)/)); + gsub( /(^|[^\\])\\\|/, "\\1\\|", tmp ); + gsub( /(^\||\|$)/, "", tmp) + split( tmp, tarray, /\|/); block = substr(block, match(block, /(\n|$)/) + 1 ); - cols = split( \ - gensub( /(^\||\|$)/, "", "g", \ - substr(block, 1, match(block, /(\n|$)/)) \ - ), talign, /[+\|]/); + tmp = substr(block, 1, match(block, /(\n|$)/)); + gsub( /(^\||\|$)/, "", tmp ); + cols = split( tmp , talign, /[+\|]/); block = substr(block, match(block, /(\n|$)/) + 1 ); for( cnt = 1; cnt < cols; cnt++ ) { @@ -437,10 +444,10 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, ttext = ttext "\n
\n" while ( match(block, "^((\\|)?([^\n]+\\|)+[^\n]+(\\|)?(\n|$))+" ) ){ - split( gensub( /(^\||\|$)/, "", "g", \ - gensub( /(^|[^\\])\\\|/, "\\1\\|", "g", \ - substr(block, 1, match(block, /(\n|$)/)) \ - )), tarray, /\|/); + tmp = substr(block, 1, match(block, /(\n|$)/)); + gsub( /(^|[^\\])\\\|/, "\\1\\|", tmp ); + gsub( /(^\||\|$)/, "", tmp ); + split( tmp, tarray, /\|/); block = substr(block, match(block, /(\n|$)/) + 1 ); ttext = ttext "" @@ -469,11 +476,14 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, cols = 0; cnt=0; ttext = ""; # Column Count - cols = split( gensub( "^(\\+(:?-+:?\\+)+)(\n.*)*$", "\\1", 1, block), tread, /\+/) - 2; + tmp = block; sub( "(\n.*)*$", "", tmp); + cols = split( tmp, tread, /\+/) - 2; # debug(" Cols: " gensub( "^(\\+(:?-+:?\\+)+)(\n.*)*$", "\\1", 1, block )); # table alignment - split( gensub( "^(.*\n)?\\+((:?=+:?\\+|(:-+|-+:|:-+:)\\+)+)(\n.*)$", "\\2", "g", block ), talign, /\+/ ); + match(block, "((:?=+:?\\+|(:-+|-+:|:-+:)\\+)+)"); + split( substr(block, RSTART, RLENGTH) , talign, /\+/ ); + # split( gensub( "^(.*\n)?\\+((:?=+:?\\+|(:-+|-+:|:-+:)\\+)+)(\n.*)$", "\\2", "g", block ), talign, /\+/ ); # debug("Align: " gensub( "^(.*\n)?\\+((:?=+:?\\+|(:-+|-+:|:-+:)\\+)+)(\n.*)$", "\\2", "g", block )); for (cnt = 1; cnt <= cols; cnt++) { @@ -492,10 +502,10 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, # table header block = substr(block, match(block, /(\n|$)/) + 1 ); while ( match(block, "^\\|([^\n]+\\|)+\n") ) { - split( gensub( /(^\||\|$)/, "", "g", \ - gensub( /(^|[^\\])\\\|/, "\\1\\|", "g", \ - substr(block, 1, match(block, /(\n|$)/)) \ - )), tread, /\|/); + tmp = substr(block, 1, match(block, /(\n|$)/)); + gsub( /\\\\/, "\\\", tmp); gsub(/\\\|/, "\\|", tmp); + gsub( /(^\||\|$)/, "", tmp ); + split(tmp, tread, /\|/); block = substr(block, match(block, /(\n|$)/) + 1 ); for (cnt = 1; cnt <= cols; cnt++) tarray[cnt] = tarray[cnt] "\n" tread[cnt]; @@ -514,10 +524,10 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, while ( match(block, /^((\|([^\n]+\|)+\n)+\+(-+\+)+(\n|$))+/ ) ){ split("", tarray); while ( match(block, /^\|([^\n]+\|)+\n/) ) { - split( gensub( /(^\||\|$)/, "", "g", \ - gensub( /(^|[^\\])\\\|/, "\\1\\|", "g", \ - substr(block, 1, match(block, /(\n|$)/)) \ - )), tread, /\|/); + tmp = substr(block, 1, match(block, /(\n|$)/)); + gsub( /\\\\/, "\\\", tmp); gsub(/\\\|/, "\\|", tmp); + gsub( /(^\||\|$)/, "", tmp); + split( tmp, tread, /\|/); block = substr(block, match(block, /(\n|$)/) + 1 ); for (cnt = 1; cnt <= cols; cnt++) tarray[cnt] = tarray[cnt] "\n" tread[cnt]; @@ -542,8 +552,9 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, return "
" text "
\n" _block( substr( block, len + 1) ); # Indented Code Block - } else if ( match(block, /^( |\t)( *\t*[^ \t\n]+ *\t*)+(\n|$)(( |\t)[^\n]+(\n|$)|[ \t]*(\n|$))*/) ) { + } else if ( match(block, /^(( |\t)[^\n]*[^\n\t ][^\n]*(\n|$))(( |\t)[^\n]*(\n|$)|[\t ]*(\n|$))*/) ) { len = RLENGTH; st = RSTART; + code = substr(block, 1, len); gsub(/(^|\n)( |\t)/, "\n", code); gsub(/^\n|\n+$/, "", code); @@ -552,9 +563,10 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, # Fenced Divs (pandoc, custom) } else if ( match( block, /^(:::+)/ ) ) { - guard = substr( block, 1, RLENGTH ); - code = block; sub(/^[^\n]+\n/, "", code); - attrib = gensub(/^:::+[ \t]*\{?[ \t]*([^\}\n]*)\}?[ \t]*\n.*$/, "\\1", 1, block); + guard = substr( block, 1, RLENGTH ); attrib = code = block; + sub(/^[^\n]+\n/, "", code); + sub(/^:::+[ \t]*\{?[ \t]*/, "", attrib); sub(/\}?[ \t]*\n.*$/, "", attrib); + # attrib = gensub(/^:::+[ \t]*\{?[ \t]*([^\}\n]*)\}?[ \t]*\n.*$/, "\\1", 1, attrib); gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); if ( match(code, "(^|\n)" guard "+(\n|$)" ) && attrib ) { @@ -574,9 +586,10 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, # Fenced Code Block (pandoc) } else if ( match( block, /^(~~~+|```+)/ ) ) { - guard = substr( block, 1, RLENGTH ); - code = gensub(/^[^\n]+\n/, "", 1, block); - attrib = gensub(/^(~~~+|```+)[ \t]*\{?[ \t]*([^\}\n]*)\}?[ \t]*\n.*$/, "\\2", 1, block); + guard = substr( block, 1, RLENGTH ); attrib = code = block; + sub(/^[^\n]+\n/, "", code); + sub(/^(~~~+|```+)[ \t]*\{?[ \t]*/, "", attrib); sub(/\}?[ \t]*\n.*$/, "", attrib); + # attrib = gensub(/^(~~~+|```+)[ \t]*\{?[ \t]*([^\}\n]*)\}?[ \t]*\n.*$/, "\\2", 1, attrib); gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); if ( match(code, "(^|\n)" guard "+(\n|$)" ) && attrib ) { @@ -627,22 +640,26 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, return headline(2, text, 0) _block( substr( block, len + 1) ); # Nth Order Heading H1 H2 H3 H4 H5 H6 + Attrib - } else if ( match( block, /^(#{1,6})[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*([ \t]*\{([a-zA-Z \t-]*)\})(\n|$)/ ) ) { + } else if ( match( block, /^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*([ \t]*\{([a-zA-Z \t-]*)\})(\n|$)/ ) ) { len = RLENGTH; text = attrib = substr(block, 1, len); - match(block, /^#{1,6}/); n = RLENGTH; + match(block, /^##?#?#?#?#?[^#]/); n = RLENGTH - 1; - sub(/^(#{1,6})[ \t]*/, "", text); sub(/[ \t]*#*([ \t]*\{([a-zA-Z \t-]*)\})(\n.*)?$/, "", text); - sub(/^(#{1,6})[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*[ \t]*\{/, "", attrib); + # sub(/^(##?#?#?#?#?)[ \t]*/, "", text); # not working in mawk + text = substr(text, n + 1); sub(/^[ \t]*/, "", text); + sub(/[ \t]*#*([ \t]*\{([a-zA-Z \t-]*)\})(\n.*)?$/, "", text); + sub(/^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*[ \t]*\{/, "", attrib); sub(/\})(\n.*)?$/, "", attrib); gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); return headline( n, text, attrib ) _block( substr( block, len + 1) ); # Nth Order Heading H1 H2 H3 H4 H5 H6 - } else if ( match( block, /^(#{1,6})[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*(\n|$)/ ) ) { + } else if ( match( block, /^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*(\n|$)/ ) ) { len = RLENGTH; text = substr(block, 1, len); - match(block, /^#{1,6}/); n = RLENGTH; - sub(/^(#{1,6})[ \t]*/, "", text); sub(/[ \t]*#*(\n.*)?$/, "", text); + match(block, /^##?#?#?#?#?[^#]/); n = RLENGTH - 1; + # sub(/^(##?#?#?#?#?)[ \t]+/, "", text); # not working in mawk + text = substr(text, n + 1); sub(/^[ \t]*/, "", text); + sub(/[ \t]*#*(\n.*)?$/, "", text); return headline( n, text, 0 ) _block( substr( block, len + 1) ); @@ -678,11 +695,12 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, "\n\n" \ _block( substr( block, len + 1) ); - # reference style images (block) - } else if ( match(line, /^!\[([^]]*)\] ?\[([^]]*)\](\n|$)/ ) ) { - len = RLENGTH; - text = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\](\n.*)?$/, "\\1", 1, block); - id = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\](\n.*)?$/, "\\2", 1, block); + } else if ( match(block, /^!\[([^]]*)\] ?\[([^]]*)\](\n|$)/ ) ) { + len = RLENGTH; text = id = block; + sub(/(\n.*)?$/, "", text); sub( /^!\[/, "", text); sub(/\] ?\[([^\n]*)\]$/, "", text); + sub(/(\n.*)?$/, "", id); sub( /^!\[([^\n]*)\] ?\[/, "", id); sub(/\]$/, "", id); + # text = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\](\n.*)?$/, "\\1", 1, block); + # id = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\](\n.*)?$/, "\\2", 1, block); if ( ! id ) id = text; if ( rl_href[id] && rl_title[id] ) { return "
" \ @@ -701,8 +719,9 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, # Macros (standalone <> calls handled as block, so they are not wrapped in paragraph) } else if ( match( block, /^<<(([^>]|>[^>])+)>>(\n|$)/ ) ) { - len = RLENGTH; - text = gensub(/^<<(([^>]|>[^>])+)>>(\n.*)?$/, "\\1", 1, block); + len = RLENGTH; text = block; + sub(/^<>(\n.*)?$/, "", text); + # text = gensub(/^<<(([^>]|>[^>])+)>>(\n.*)?$/, "\\1", 1, block); return "" HTML(text) "" _block(substr(block, len + 1) ); # Definition list @@ -743,7 +762,8 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, _block( substr(block, st + len) ); # Horizontal rule - } else if ( match( block, /(^|\n) ? ? ?((\* *){3,}|(- *){3,}|(_ *){3,})($|\n)/) ) { + # } else if ( match( block, /(^|\n) ? ? ?((\* *){3,}|(- *){3,}|(_ *){3,})($|\n)/) ) { + } else if ( match( block, /(^|\n) ? ? ?((\* *)(\* *)(\* *)(\* *)*|(- *)(- *)(- *)(- *)*|(_ *)(_ *)(_ *)(_ *)*)($|\n)/) ) { len = RLENGTH; st = RSTART; return _block(substr(block, 1, st - 1)) "
\n" _block(substr(block, st + len)); @@ -753,7 +773,7 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, } } -function _startlist(block, type, mark, exclude, LOCAL, st, len, list, indent, text) { +function _startlist(block, type, mark, exclude, LOCAL, st, len, list, indent, it, text) { if (match( block, "(^|\n) ? ? ?" mark "[ \t][^\n]+(\n|$)" \ "(([ \t]*\n)* ? ? ?" mark "[ \t][^\n]+(\n|$)" \ "|([ \t]*\n)*( ? ? ?\t| +)[^\n]+(\n|$)" \ @@ -761,7 +781,9 @@ function _startlist(block, type, mark, exclude, LOCAL, st, len, list, indent, te st = RSTART; len = RLENGTH; list = substr( block, st, len); sub("^\n", "", list); match(list, "^ ? ? ?"); indent = RLENGTH; - gsub( "(^|\n) {0," indent "}", "\n", list); sub("^\n", "", list); + it = ""; while ( indent > 0 ) { it = it " ?"; indent--; } + # gsub( "(^|\n) {0," indent "}", "\n", list); sub("^\n", "", list); + gsub( "(^|\n)" it, "\n", list); sub("^\n", "", list); text = substr(block, 1, st - 1); block = substr(block, st + len); if (match(text, /\n[[:space:]]*\n/)) return 0; @@ -775,10 +797,12 @@ function _startlist(block, type, mark, exclude, LOCAL, st, len, list, indent, te } else return 0; } -function _list (block, mark, p, LOCAL, len, st, text, indent, task) { +function _list (block, mark, p, LOCAL, len, st, text, indent, it, task) { if ( match(block, "^([ \t]*\n)*$")) return; match(block, "^" mark "[ \t]"); indent = RLENGTH; + it = ""; while ( indent > 0 ) { it = it " ?"; indent--; } + sub("^" mark "[ \t]", "", block); if (match(block, /\n[ \t]*\n/)) p = 1; @@ -787,7 +811,8 @@ function _list (block, mark, p, LOCAL, len, st, text, indent, task) { st = (RLENGTH == -1) ? length(block) + 1 : RSTART; text = substr(block, 1, st); block = substr(block, st + 1); - gsub("\n {0," indent "}", "\n", text); + # gsub("\n {0," indent "}", "\n", text); + gsub("\n" it, "\n", text); task = match( text, /^\[ \]/ ) ? "
  • " : \ match( text, /^\[-\]/ ) ? "
  • " : \ @@ -803,7 +828,7 @@ function _list (block, mark, p, LOCAL, len, st, text, indent, task) { return task text "
  • \n" _list(block, mark, p); } -function _dlist (block, LOCAL, len, st, text, indent, p) { +function _dlist (block, LOCAL, len, st, text, indent, it, p) { if (match( block, "^([ \t]*\n)*[^:\n \t][^\n]+\n" )) { len = RLENGTH; text = substr(block, 1, len); gsub( "(^\n*|\n*$)", "", text ); @@ -816,8 +841,10 @@ function _dlist (block, LOCAL, len, st, text, indent, p) { len = RLENGTH; text = substr(block, 1, len); sub( "^([ \t]*\n)*", "", text); match(text, "^ ? ? ?:(\t| +)"); indent = RLENGTH; + it = ""; while ( indent > 0 ) { it = it " ?"; indent--; } sub( "^ ? ? ?:(\t| +)", "", text); - gsub( "(^|\n) {0," indent "}", "\n", text ); + # gsub( "(^|\n) {0," indent "}", "\n", text ); + gsub( "(^|\n)" it, "\n", text ); text = _nblock(text); if (match( text, "^

    (]|\n$" )) @@ -858,10 +885,17 @@ BEGIN { re_reflink = "(^|\n) ? ? ?\\[([^]\n]+)\\]: ([^ \t\n]+)(\n?[ \t]+(\"([^\"]+)\"|'([^']+)'|\\(([^)]+)\\)))?(\n|$)"; # /(^|\n) ? ? ?\[([^]\n]+)\]: ([^ \t\n]+)(\n?[ \t]+("([^"]+)"|'([^']+)'|\(([^)]+)\)))?(\n|$)/ while ( match(f, re_reflink ) ) { - rl_id = gensub( re_reflink, "\\2", 1, substr(f, RSTART, RLENGTH) ); - rl_href[rl_id] = gensub( re_reflink, "\\3", 1, substr(f, RSTART, RLENGTH) ); - rl_title[rl_id] = gensub( re_reflink, "\\5", 1, substr(f, RSTART, RLENGTH) ); - f = substr(f, RSTART + RLENGTH); + tt = th = ti = substr(f, RSTART, RLENGTH); f = substr(f, RSTART + RLENGTH); + sub("(^|\n) ? ? ?\\[", "", ti); sub("\\]: ([^ \t\n]+)(\n?[ \t]+(\"([^\"]+)\"|'([^']+)'|\\(([^)]+)\\)))?(\n.*)?$", "", ti); + sub("(^|\n) ? ? ?\\[([^]\n]+)\\]: ", "", th); sub("(\n?[ \t]+(\"([^\"]+)\"|'([^']+)'|\\(([^)]+)\\)))?(\n.*)?$", "", th); + if (match(tt, "(^|\n) ? ? ?\\[([^]\n]+)\\]: ([^ \t\n]+)(\n?[ \t]+(\"([^\"]+)\"|'([^']+)'|\\(([^)]+)\\)))(\n|$)")) { + sub("(^|\n) ? ? ?\\[([^]\n]+)\\]: ([^ \t\n]+)", "", tt); sub("^\n?[ \t]+", "", tt); sub("(\n.*)?$", "", tt); + } else { tt = ""; } + rl_id = ti; rl_href[rl_id] = th; rl_title[rl_id] = tt; + # rl_id = gensub( re_reflink, "\\2", 1, substr(f, RSTART, RLENGTH) ); + # rl_href[rl_id] = gensub( re_reflink, "\\3", 1, substr(f, RSTART, RLENGTH) ); + # rl_title[rl_id] = gensub( re_reflink, "\\5", 1, substr(f, RSTART, RLENGTH) ); + # f = substr(f, RSTART + RLENGTH); rl_title[rl_id] = substr( rl_title[rl_id], 2, length(rl_title[rl_id]) - 2 ); if ( rl_href[rl_id] ~ /<.*>/ ) rl_href[rl_id] = substr( rl_href[rl_id], 2, length(rl_href[rl_id]) - 2 ); } -- 2.39.5 From b329161df6fbf03345ccadce649151496693e18a Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Fri, 30 Aug 2024 13:09:11 +0200 Subject: [PATCH 04/16] bugfix Wiki Links, bugfix regex syntax --- markdown.awk | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/markdown.awk b/markdown.awk index fcd42cf..26fc11b 100755 --- a/markdown.awk +++ b/markdown.awk @@ -155,8 +155,10 @@ function inline( line, LOCAL, len, text, code, href, guard ) { # Wiki style links } else if ( match( line, /^\[\[([^]|]+)(\|[^]]+)?\]\]/) ) { len = RLENGTH; href = text = substr(line, 1, len); - sub(/^\[\[([^]|]+)(\|([^]]+))?\]\]/, "\\1", href ); - sub(/^\[\[([^]|]+)(\|([^]]+))?\]\]/, "\\3", text ); + sub(/^\[\[/, "", href); sub(/(\|([^]]+))?\]\].*$/, "", href); + sub(/^\[\[([^]|]+)/, "", text); sub(/\]\].*$/, "", text); sub(/^\|/, "", text); + # sub(/^\[\[([^]|]+)(\|([^]]+))?\]\]/, "\\1", href ); + # sub(/^\[\[([^]|]+)(\|([^]]+))?\]\]/, "\\3", text ); if ( ! text ) text = href; return "" HTML(text) "" inline( substr( line, len + 1) ); @@ -648,7 +650,7 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, text = substr(text, n + 1); sub(/^[ \t]*/, "", text); sub(/[ \t]*#*([ \t]*\{([a-zA-Z \t-]*)\})(\n.*)?$/, "", text); sub(/^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*[ \t]*\{/, "", attrib); - sub(/\})(\n.*)?$/, "", attrib); + sub(/\}(\n.*)?$/, "", attrib); gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); return headline( n, text, attrib ) _block( substr( block, len + 1) ); -- 2.39.5 From 1744198c8b528ceb4f9d66ad076347d5a00858fd Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Fri, 30 Aug 2024 13:15:47 +0200 Subject: [PATCH 05/16] test wiki links and some full pages --- tests-markdown.sh | 803 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 800 insertions(+), 3 deletions(-) diff --git a/tests-markdown.sh b/tests-markdown.sh index f67cfbf..ff56c02 100755 --- a/tests-markdown.sh +++ b/tests-markdown.sh @@ -1,5 +1,7 @@ #!/bin/sh +runtimes="gawk bawk goawk" + BR=' ' CR="$(printf \r)" @@ -9,15 +11,20 @@ awk() { /bin/awk "$@"; } md_gawk() { gawk -f markdown.awk "$@"; } md_bawk() { busybox awk -f markdown.awk "$@"; } md_mawk() { mawk -f markdown.awk "$@"; } +md_goawk() { goawk -f markdown.awk "$@"; } acnt=1 # assertion count assert() { - local md comp="$2" msg="$3" + local md comp="$2" msg="$3" ex printf "%3i: %s ... " $acnt "$msg" - for proc in gawk bawk mawk; do + for proc in $runtimes; do printf '%s ' $proc - md="$(printf '%s' "$1" |md_"$proc")" + md="$(printf '%s' "$1" |md_"$proc")"; ex=$? + if [ "$ex" != 0 ]; then + printf "Fail!\nExit Code: %i\n" $ex + exit 1 + fi if [ "$md" != "$comp" ]; then printf "Fail!\n:\n%s\n:\n%s\n" "$md" "$comp" exit 1 @@ -57,6 +64,10 @@ assert '\©' "

    &copy;

    " "escape" assert 'AT&T' "

    AT&T

    " "escape" assert '`©`' "

    &copy;

    " "code span escape" +# Wiki Links +assert '[[Link/]]' '

    Link/

    ' "Wiki Link" +assert '[[Link/|Linked Page]]' '

    Linked Page

    ' "Wiki Link" + # Automatic Links assert '' "

    https://de.wikipedia.org

    " "automatic link" assert '' "

    http://de.wikipedia.org

    " "automatic link" @@ -415,4 +426,790 @@ assert '![Image] [1] assert '<" _foo_>>' 'macro /test -- "* weird <args>" _foo_' "Macros/Block" + +printf '\n## Testing example pages ##\n' + +assert 'Markdown.awk +============ + +Supported Features / TODO: +-------------------------- +- [x] done +- [ ] todo +- [-] not planned +- ? unsure (whether to implement) +- [/] partial + +### Basic Markdown - Block elements: ### +- [x] Paragraphs + - [x] Double space line breaks +- [x] Proper block element nesting +- [x] Headings +- [x] ATX-Style Headings +- [x] Blockquotes +- [x] Lists (ordered, unordered) +- [x] Code blocks (using indention) +- [x] Horizontal rules +- [x] Verbatim HTML block (disabled by default) + +### Basic Markdown - Inline elements: ### +- [x] Links +- [x] Reference style links +- [x] Emphasis *em*/**strong** (*Asterisk*, _Underscore_) +- [x] `code`, also ``code containing `backticks` `` +- [x] Images / reference style images +- [x] +- [x] backslash escapes +- [x] Verbatim HTML inline (disabled by default) +- [x] HTML escaping + +NOTE: Set the environment variable `MD_HTML=true` to enable verbatim HTML + +### Extensions - Block elements: ### +- [x] Automatic
    -wrapping (custom) +- ? Heading identifiers (php md, pandoc) + - [x] Heading attributes (custom) +- [x] Automatic heading identifiers (custom) +- [x] Fenced code blocks (php md, pandoc) + - [x] Fenced code attributes +- [x] Images (as block elements,
    -wrapped) (custom) + - [x] reference style block images +- [/] Tables + - ? Simple table (pandoc) + - ? Multiline table (pandoc) + - [x] Grid table (pandoc) + - [x] Headerless + - [x] Pipe table (php md, pandoc) +- [x] Line blocks (pandoc) +- [x] Task lists (pandoc, custom) +- [x] Definition lists (php md, pandoc) +- [-] Numbered example lists (pandoc) +- [-] Metadata blocks (pandoc) +- [x] Metadata blocks (custom) +- [x] Fenced Divs (pandoc) + +### Extensions - Inline elements: ### +- [x] Ignore embedded_underscores (php md, pandoc) +- [x] ~~strikeout~~ (pandoc) +- [x] ^Superscript^ ~Subscript~ (pandoc) +- [-] Bracketed spans (pandoc) + - [-] Inline attributes (pandoc) +- [x] Image attributes (custom, pandoc inspired, not for reference style) +- [x] Wiki style links [[PageName]] / [[PageName|Link Text]] +- [-] TEX-Math (pandoc) +- ? Footnotes (php md) +- ? Abbreviations (php md) +- ? "Curly quotes" (smartypants) +- [ ] em-dashes (--) (smartypants old) +- ? ... three-dot ellipsis (smartypants) +- [-] en-dash (smartypants) +- [ ] Automatic em-dash / en-dash +- [x] Automatic -> Arrows <- (custom) + +Compatibility +------------- +Markdown.awk can run in GNU awk (`gawk`) and in Busybox awk. It is _not_ fully POSIX compliant and does not run in `mawk` or `nawk`. In particular it makes heavy use of the `gensub()` function and its ability to use paranthesized subexpressions in the replacement text. This feature is not available in the POSIX specified `sub()` and `gsub()` functions. Hence it cannot be replaced without effort. + +Tests +----- +[Link with Title](https://en.wikipedia.org/wiki/Markdown "Markdown in Wikipedia"), *emphasis*, **strong**, **strong containing *emphasis***, `inline code`, `` code with `backticks` ``. See more tests [here](./tests/).' \ +'

    Markdown.awk

    +

    Supported Features / TODO:

    +
      +
    • done
    • +
    • todo
    • +
    • not planned
    • +
    • ? unsure (whether to implement)
    • +
    • partial
    • +
    +

    Basic Markdown - Block elements:

    +
      +
    • Paragraphs

      +
        +
      • Double space line breaks
      • +
      +
    • +
    • Proper block element nesting
    • +
    • Headings
    • +
    • ATX-Style Headings
    • +
    • Blockquotes
    • +
    • Lists (ordered, unordered)
    • +
    • Code blocks (using indention)
    • +
    • Horizontal rules
    • +
    • Verbatim HTML block (disabled by default)
    • +
    +

    Basic Markdown - Inline elements:

    +
      +
    • Links
    • +
    • Reference style links
    • +
    • Emphasis em/strong (Asterisk, Underscore)
    • +
    • code, also code containing `backticks`
    • +
    • Images / reference style images
    • +
    • <automatic links>
    • +
    • backslash escapes
    • +
    • Verbatim HTML inline (disabled by default)
    • +
    • HTML escaping
    • +
    +

    NOTE: Set the environment variable MD_HTML=true to enable verbatim HTML

    + +

    Extensions - Block elements:

    +
      +
    • Automatic <section>-wrapping (custom)
    • +
    • ? Heading identifiers (php md, pandoc)

      +
        +
      • Heading attributes (custom)
      • +
      +
    • +
    • Automatic heading identifiers (custom)
    • +
    • Fenced code blocks (php md, pandoc)

      +
        +
      • Fenced code attributes
      • +
      +
    • +
    • Images (as block elements, <figure>-wrapped) (custom)

      +
        +
      • reference style block images
      • +
      +
    • +
    • Tables

      +
        +
      • ? Simple table (pandoc)
      • +
      • ? Multiline table (pandoc)
      • +
      • Grid table (pandoc)

        +
          +
        • Headerless
        • +
        +
      • +
      • Pipe table (php md, pandoc)
      • +
      +
    • +
    • Line blocks (pandoc)
    • +
    • Task lists (pandoc, custom)
    • +
    • Definition lists (php md, pandoc)
    • +
    • Numbered example lists (pandoc)
    • +
    • Metadata blocks (pandoc)
    • +
    • Metadata blocks (custom)
    • +
    • Fenced Divs (pandoc)
    • +
    +

    Extensions - Inline elements:

    +
      +
    • Ignore embedded_underscores (php md, pandoc)
    • +
    • strikeout (pandoc)
    • +
    • Superscript Subscript (pandoc)
    • +
    • Bracketed spans (pandoc)

      +
        +
      • Inline attributes (pandoc)
      • +
      +
    • +
    • Image attributes (custom, pandoc inspired, not for reference style)
    • +
    • Wiki style links PageName / Link Text
    • +
    • TEX-Math (pandoc)
    • +
    • ? Footnotes (php md)
    • +
    • ? Abbreviations (php md)
    • +
    • ? "Curly quotes" (smartypants)
    • +
    • em-dashes (--) (smartypants old)
    • +
    • ? ... three-dot ellipsis (smartypants)
    • +
    • en-dash (smartypants)
    • +
    • Automatic em-dash / en-dash
    • +
    • Automatic → Arrows ← (custom)
    • +
    +

    Compatibility

    +

    Markdown.awk can run in GNU awk (gawk) and in Busybox awk. It is not fully POSIX compliant and does not run in mawk or nawk. In particular it makes heavy use of the gensub() function and its ability to use paranthesized subexpressions in the replacement text. This feature is not available in the POSIX specified sub() and gsub() functions. Hence it cannot be replaced without effort.

    + +

    Tests

    +

    Link with Title, emphasis, strong, strong containing emphasis, inline code, code with `backticks`. See more tests here.

    +
    ' \ +'Full Page (cgilite markdown)' + +assert 'Headline First Order +==================== + +Headline Second Order +--------------------- + + Code Block + with indentation + +> Blockquote +> ---------- +> like in an email + +### Headline 3rd order + +- unordered List +1. with sub points + + sometimes longer ones + +2. which are ordered +3. [ ] and have a Todo item +- more list points + - and a sublist +- [x] some of which ae done + +---------- ++ A lazy, lazy, list +item. + ++ Another one; this looks +bad but is legal. + + Second paragraph of second +list item. + +--------- + +~~~ {.blue} +Fenced Code Block +# with verbatim Text +`and an attribute` +~~~ + +| The limerick packs laughs anatomical +| In space that is quite economical. +| But the *good* ones I'\''ve seen +| So seldom are *clean* +| And the clean ones so seldom are comical + +| The Right Honorable Most Venerable and Righteous Samuel L. + Constable, Jr. +| 200 Main St. +| Berkeley, CA 94718 + +Term 1 + +: This is a definition with two paragraphs. Lorem ipsum + dolor sit amet, consectetuer adipiscing elit. Aliquam + hendrerit mi posuere lectus. + + Vestibulum enim wisi, viverra nec, fringilla in, laoreet + vitae, risus. + +: Second definition for term 1, also wrapped in a paragraph + because of the blank line preceding it. + +Term 2 + +: This definition has a code block, a blockquote and a list. + + code block. + + > block quote + > on two lines. + + 1. first list item + 2. second list item' \ +'

    Headline First Order

    +

    Headline Second Order

    +
    Code Block
    +with indentation
    +

    Blockquote

    +

    like in an email

    +
    + +

    Headline 3rd order

    +
      +
    • unordered List
    • +
    +
      +
    1. with sub points

      + +

      sometimes longer ones

      +
    2. +
    3. which are ordered

      +
    4. +
    5. and have a Todo item

      +
    6. +
    +
      +
    • more list points

      +
        +
      • and a sublist
      • +
      +
    • +
    • some of which ae done
    • +
    +
    +
      +
    • A lazy, lazy, list +item.

      +
    • +
    • Another one; this looks +bad but is legal.

      + +

      Second paragraph of second +list item.

      +
    • +
    +
    + +
    Fenced Code Block
    +# with verbatim Text
    +`and an attribute`
    +
    The limerick packs laughs anatomical
    +In space that is quite economical.
    + But the good ones I've seen
    + So seldom are clean
    +And the clean ones so seldom are comical
    +
    The Right Honorable Most Venerable and Righteous Samuel L. Constable, Jr.
    +200 Main St.
    +Berkeley, CA 94718
    +
    +
    Term 1
    +

    This is a definition with two paragraphs. Lorem ipsum +dolor sit amet, consectetuer adipiscing elit. Aliquam +hendrerit mi posuere lectus.

    + +

    Vestibulum enim wisi, viverra nec, fringilla in, laoreet +vitae, risus.

    +
    +
    Second definition for term 1, also wrapped in a paragraph +because of the blank line preceding it.
    +
    Term 2
    +

    This definition has a code block, a blockquote and a list.

    + +
    code block.
    +

    block quote +on two lines.

    + +
      +
    1. first list item
    2. +
    3. second list item
    4. +
    +
    +
    +
    ' \ +'Full Page (MD Tests)' + +assert '%css shellwiki.css + +Shellwiki +========= +Shellwiki is a Wiki and Content Management System with minimal dependencies. It can run on embedded devices, as well as full size web servers. Its goals are: + + - **easy deployment** + + *ShellWiki* can run on any Unix-Like web server. It requires no + scripting languages beyound the regular (Bourne style) Unix + shell, `awk`, and `sed`, all of which can be providede by + `busybox`. It can be launched via `netcat`, `inetd`, `systemd`, + or any cgi capable webserver like `apache` or `lighttpd`. + *ShellWiki* can run easily on embedded systems like OpenWRT or + RaspberryPi, and just as easily on internet web servers + providing multisite setups. + + - **accessibility** + + *ShellWiki* requires no browserside scripting. It aims to be rendered + in all web browsers including `w3m` and `links` besides graphical + browsers like `chromium` or `firefox`. It is as accessible on mobile + screens as on desktop computers. + *ShellWiki* uses the well known `markdown` syntax for formatting and + aims to provide consistent UI controls for various use cases. + + - **adaptability** + + *ShellWiki* is extensible through plugins and provides theming and + styling capabilities that make it suitable not only as a wiki, but + also as a CMS, including access scopes for different authors and + stylisticly distinct subpages. + + - **simplicity** + + *ShellWiki* avoids complexity in both software design and user + interface. It aims to be secure and predictable. Extensions can + be written and modified by system administrators. + +<> + +Features +-------- + - **Markdown Wiki Syntax** + + The wiki syntax is based on [John Grubers Markdown](https://daringfireball.net/projects/markdown/) + with extensions inspired by [Pandoc](https://pandoc.org/MANUAL.html#pandocs-markdown), + [PHP Markdown Extra](https://michelf.ca/projects/php-markdown/extra/), and + [Github Flavored Markdown](https://github.github.com/gfm/). + Additional Macros are provided to enable functions like an automatic table of contents, listing of + sub pages, etc. + + See [Markdown](/software/cgilite/markdown/) + and [Macros](macros/) + + - **Plain file Storage** + + Pages and attachments are stored as plain files on disk. There is no need for a separate database + system. + + - **Git revisioning** + + If `git` is available on the web server, pages can be revisioned so that past versions can be + revisited. Optionally attachments can be revisioned too. Server administrators may use the git + archives to synchronise sites across servers by adding their own mechanics. + + - **Multisite Installation** + + Code and data directories are stricly separate on the server. Directory pathes are obtained from + environment variables, so that multiple sites can be served from the same installation directory. + + See also: [Installation](installation/) + + - **Semantic HTML5** + + for accessible rendering of pages + + - **Descriptive Page Names** + + URLs of pages can be freely provided by the user. User access can be constrained to specific sub + pages. Within their access permissions users can move and rename pages as they like. + + - **File Upload / Attachment** + + While pages are merely text documents themselves, users can upload additional attachments and + link to them in a page. Images and media files can be embedded directly into a page. + + - **Image scaling** + + If `ImageMagick` is available on the web server, huge attachment images are automatically compressed + and scaled to HD resolution when referred to in a page. + Of course the original version can still be linked to. + + - **Permissions via ACL** + + Grant read/write access for pages and sub-pages + + - **User provided CSS** + + Aside from full theming in the installation directory, pages can be styled using CSS files + uploaded as attachments. + + - **No reliance on Javascript** + + Authors and visitors can use the site without being forced to run untrusted code. + The main theme still provides collapsible menus and a responsive layout. + + - **Complete GDPR Compliance** without consent walls + + Because shellwiki does not track page visitors and does not + serve cookies to visitors by default it does not need to coerce + visitors into handling GDPR "consent" forms. + + (Login for authors still requires a session cookie) + + - **True multilanguage capability** + + - Pages can be translated + - Switching language does not require a cookie + - Fallback language for missing translations + - Users stay on a translated version, even if single page translations are missing + + - **Full text indexing and search** + + Shellwiki contains its own basic text indexer without external dependencies. + + - **Extensibility** through + + - [Themes](themes/) + - [Macros](macros/) + - [URL Handlers](handlers/) + - [Custom Syntax parsers](parsers/) + +Dependencies +------------ +Shellwiki is based on [cgilite](/software/cgilite/), which is included in the installation. It is written in posix compliant shell script, and the markdown renderer is written in ~~posix compliant~~ AWK. The entire wiki system can run with nothing more than a busybox. In fact it can be served from the rescue shell in a Debian initrd, or from an OpenWRT router. + +**Its precise requirements are:** + + - A Posix Shell (as provided by busybox, but bash is OK) + - An AWK interpreter (as provided by busybox, but GNU AWK is OK) + - `mawk` and `nawk` will currently not work + - inetd (as provided by busybox) + + **or** any CGI-Capable web server + + - _Optional:_ GIT for revisioning + - _Optional:_ ImageMagick for image compression + - _Optional:_ Sendmail for sending password reminders, etc. + +Installation +------------ +Also see -> [[installation/]] + +You can try out shellwiki right now using busybox: + + ~$ git clone https://git.plutz.net/git/shellwiki ~/shellwiki + ~$ _DATA=~/wikidata busybox nc -llp 1080 -e ~/shellwiki/index.cgi + +For additional examples, regarding permanent installation and configuration in webservers see [[installation/]]. + +Syntax +------ +The wiki syntax is based on John Grubers [Markdown](https://daringfireball.net/projects/markdown/) with extensions borrowed from [Pandoc](https://pandoc.org/MANUAL%202.html#pandocs-markdown) and [PHP Markdown Extra](https://michelf.ca/projects/php-markdown/extra/). The Markdown parser is provided by [Cgilite](/software/cgilite/) and its full documentation can be looked at [here](/software/cgilite/markdown/). + +<> + +Macros +------ +Also see -> [[macros/]] + +In addition to the Markdown syntax, wiki pages can include Macros, which perform additional functions on a page, like generating an image gallery, including parts of other pages, etc. Macros make Shellwiki truly dynamic and flexible. + +For example you can include a table of content for the current page by including the line + + <> + +in your page. Macros can receive additional parameters, which modify their behaviour. + +Macros are the most easy to write type of extension. See [Macros](macros/) for a full list of available macros. + +Themes +------ +Also see -> [[theming/]] + +While Shellwiki supports plugins for [theming](dev-theming/), it'\''s apearance can mostly be configured by the user. Pages can be configured to use custom CSS files. In addition page headers and footers are themselves wiki pages which can be modified to add menus, custom logos, links, etc. The same goes for error pages. + +For an example, see the [technical pages](/[wiki]/) for this wiki. + +Multiple Languages +------------------ +To enable a multilingual setup you must set a default language in your configuration environment: + +``` +export LANGUAGE_DEFAULT=en +``` + +Once this is the case, pagenames starting with a colon (`:`) will be considered translated versions of their parent pages. I.e. the pages `/`, `/:de`, and `/:fr` will serve as the default, german, and french home page respectively. + +The names of the languages can be arbitrary, but I recommend using [ISO-639](https://en.wikipedia.org/wiki/ISO_639-1) codes, because the code is used in the `lang=""` attribute of the pages top level html element. You can however make up non-standardised or fantastic language names as well. + +Links on each page will automatically be suffixed with the same language tag, so a visitor keeps browsing the same language without needing a cookie. Attachments should only be uploaded to the default language page, and attachment links in the translated pages will correctly point to the main page attachments. You can create a language menu on the header page, simply by linking to `./:en`, `./:es` , `./:fr`, etc. + +Header, footer, and error pages will be included from their respective language version, as will all macro includes, etc. Should a page not exist in a given language, the default page will be displayed instead. However, included elements will still be taken from the respective language version, possibly mixing languages between the selected user language and the default. + +### Constraints of the current implementation + - There can be only one default language, with no priority of different fallback languages + - Page URLs can currently not be translated. Doing so would require a model for manually assigning translated page names and would not be trivial to use. + +Developer Documentation +----------------------- +How to write: + + - [Themes](dev-theming/) + - [Macros](dev-macros/) + - [Handlers](dev-handlers/) + - [Parsers](dev-parsers/)' \ +'

    Shellwiki

    +

    Shellwiki is a Wiki and Content Management System with minimal dependencies. It can run on embedded devices, as well as full size web servers. Its goals are:

    +
      +
    • easy deployment

      + +

      ShellWiki can run on any Unix-Like web server. It requires no +scripting languages beyound the regular (Bourne style) Unix +shell, awk, and sed, all of which can be providede by +busybox. It can be launched via netcat, inetd, systemd, +or any cgi capable webserver like apache or lighttpd.
      +ShellWiki can run easily on embedded systems like OpenWRT or +RaspberryPi, and just as easily on internet web servers +providing multisite setups.

      +
    • +
    • accessibility

      + +

      ShellWiki requires no browserside scripting. It aims to be rendered +in all web browsers including w3m and links besides graphical +browsers like chromium or firefox. It is as accessible on mobile +screens as on desktop computers.
      +ShellWiki uses the well known markdown syntax for formatting and +aims to provide consistent UI controls for various use cases.

      +
    • +
    • adaptability

      + +

      ShellWiki is extensible through plugins and provides theming and +styling capabilities that make it suitable not only as a wiki, but +also as a CMS, including access scopes for different authors and +stylisticly distinct subpages.

      +
    • +
    • simplicity

      + +

      ShellWiki avoids complexity in both software design and user +interface. It aims to be secure and predictable. Extensions can +be written and modified by system administrators.

      +
    • +
    +toc 2 2

    Features

    +
      +
    • Markdown Wiki Syntax

      + +

      The wiki syntax is based on John Grubers Markdown +with extensions inspired by Pandoc, +PHP Markdown Extra, and +Github Flavored Markdown. +Additional Macros are provided to enable functions like an automatic table of contents, listing of +sub pages, etc.

      + +

      See Markdown
      +and Macros

      +
    • +
    • Plain file Storage

      + +

      Pages and attachments are stored as plain files on disk. There is no need for a separate database +system.

      +
    • +
    • Git revisioning

      + +

      If git is available on the web server, pages can be revisioned so that past versions can be +revisited. Optionally attachments can be revisioned too. Server administrators may use the git +archives to synchronise sites across servers by adding their own mechanics.

      +
    • +
    • Multisite Installation

      + +

      Code and data directories are stricly separate on the server. Directory pathes are obtained from +environment variables, so that multiple sites can be served from the same installation directory.

      + +

      See also: Installation

      +
    • +
    • Semantic HTML5

      + +

      for accessible rendering of pages

      +
    • +
    • Descriptive Page Names

      + +

      URLs of pages can be freely provided by the user. User access can be constrained to specific sub +pages. Within their access permissions users can move and rename pages as they like.

      +
    • +
    • File Upload / Attachment

      + +

      While pages are merely text documents themselves, users can upload additional attachments and +link to them in a page. Images and media files can be embedded directly into a page.

      +
    • +
    • Image scaling

      + +

      If ImageMagick is available on the web server, huge attachment images are automatically compressed +and scaled to HD resolution when referred to in a page. +Of course the original version can still be linked to.

      +
    • +
    • Permissions via ACL

      + +

      Grant read/write access for pages and sub-pages

      +
    • +
    • User provided CSS

      + +

      Aside from full theming in the installation directory, pages can be styled using CSS files +uploaded as attachments.

      +
    • +
    • No reliance on Javascript

      + +

      Authors and visitors can use the site without being forced to run untrusted code. +The main theme still provides collapsible menus and a responsive layout.

      +
    • +
    • Complete GDPR Compliance without consent walls

      + +

      Because shellwiki does not track page visitors and does not +serve cookies to visitors by default it does not need to coerce +visitors into handling GDPR "consent" forms.

      + +

      (Login for authors still requires a session cookie)

      +
    • +
    • True multilanguage capability

      +
        +
      • Pages can be translated
      • +
      • Switching language does not require a cookie
      • +
      • Fallback language for missing translations
      • +
      • Users stay on a translated version, even if single page translations are missing
      • +
      +
    • +
    • Full text indexing and search

      + +

      Shellwiki contains its own basic text indexer without external dependencies.

      +
    • +
    • Extensibility through

      + +
    • +
    +

    Dependencies

    +

    Shellwiki is based on cgilite, which is included in the installation. It is written in posix compliant shell script, and the markdown renderer is written in posix compliant AWK. The entire wiki system can run with nothing more than a busybox. In fact it can be served from the rescue shell in a Debian initrd, or from an OpenWRT router.

    + +

    Its precise requirements are:

    +
      +
    • A Posix Shell (as provided by busybox, but bash is OK)

      +
    • +
    • An AWK interpreter (as provided by busybox, but GNU AWK is OK)

      +
        +
      • mawk and nawk will currently not work
      • +
      +
    • +
    • inetd (as provided by busybox)

      + +

      or any CGI-Capable web server

      +
    • +
    • Optional: GIT for revisioning

      +
    • +
    • Optional: ImageMagick for image compression

      +
    • +
    • Optional: Sendmail for sending password reminders, etc.

      +
    • +
    +

    Installation

    +

    Also see → installation/

    + +

    You can try out shellwiki right now using busybox:

    + +
    ~$ git clone https://git.plutz.net/git/shellwiki ~/shellwiki
    +~$ _DATA=~/wikidata busybox nc -llp 1080 -e ~/shellwiki/index.cgi
    +

    For additional examples, regarding permanent installation and configuration in webservers see installation/.

    + +

    Syntax

    +

    The wiki syntax is based on John Grubers Markdown with extensions borrowed from Pandoc and PHP Markdown Extra. The Markdown parser is provided by Cgilite and its full documentation can be looked at here.

    + +include --nolink /[wiki]/editorhelp/

    Macros

    +

    Also see → macros/

    + +

    In addition to the Markdown syntax, wiki pages can include Macros, which perform additional functions on a page, like generating an image gallery, including parts of other pages, etc. Macros make Shellwiki truly dynamic and flexible.

    + +

    For example you can include a table of content for the current page by including the line

    + +
    <<toc>>
    +

    in your page. Macros can receive additional parameters, which modify their behaviour.

    + +

    Macros are the most easy to write type of extension. See Macros for a full list of available macros.

    + +

    Themes

    +

    Also see → theming/

    + +

    While Shellwiki supports plugins for theming, it's apearance can mostly be configured by the user. Pages can be configured to use custom CSS files. In addition page headers and footers are themselves wiki pages which can be modified to add menus, custom logos, links, etc. The same goes for error pages.

    + +

    For an example, see the technical pages for this wiki.

    + +

    Multiple Languages

    +

    To enable a multilingual setup you must set a default language in your configuration environment:

    + +
    export LANGUAGE_DEFAULT=en
    +

    Once this is the case, pagenames starting with a colon (:) will be considered translated versions of their parent pages. I.e. the pages /, /:de, and /:fr will serve as the default, german, and french home page respectively.

    + +

    The names of the languages can be arbitrary, but I recommend using ISO-639 codes, because the code is used in the lang="" attribute of the pages top level html element. You can however make up non-standardised or fantastic language names as well.

    + +

    Links on each page will automatically be suffixed with the same language tag, so a visitor keeps browsing the same language without needing a cookie. Attachments should only be uploaded to the default language page, and attachment links in the translated pages will correctly point to the main page attachments. You can create a language menu on the header page, simply by linking to ./:en, ./:es , ./:fr, etc.

    + +

    Header, footer, and error pages will be included from their respective language version, as will all macro includes, etc. Should a page not exist in a given language, the default page will be displayed instead. However, included elements will still be taken from the respective language version, possibly mixing languages between the selected user language and the default.

    + +

    Constraints of the current implementation

    +
      +
    • There can be only one default language, with no priority of different fallback languages
    • +
    • Page URLs can currently not be translated. Doing so would require a model for manually assigning translated page names and would not be trivial to use.
    • +
    +

    Developer Documentation

    +

    How to write:

    + +
    ' \ +'Full Page (ShellWiki)' + printf '\nAll tests passed!\n' -- 2.39.5 From c46fa28d9b531c7fe1ffa5d1bbce39a4b71f6597 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Fri, 30 Aug 2024 14:38:22 +0200 Subject: [PATCH 06/16] avoid some recursion vor better compatibility and performance --- markdown.awk | 458 ++++++++++++++++++++++++++++----------------------- 1 file changed, 250 insertions(+), 208 deletions(-) diff --git a/markdown.awk b/markdown.awk index 26fc11b..d1c8b5b 100755 --- a/markdown.awk +++ b/markdown.awk @@ -121,217 +121,259 @@ function URL ( text, sharp ) { return text; } -function inline( line, LOCAL, len, text, code, href, guard ) { - if ( line ~ /^$/ ) { # Recursion End - return ""; - - # omit processing of escaped characters - } else if ( line ~ /^\\./) { - return HTML(substr(line, 2, 1)) inline( substr(line, 3) ); - - # hard brakes - } else if ( match(line, /^ \n/) ) { - return "
    \n" inline( substr(line, RLENGTH + 1) ); - - # ``code spans`` - } else if ( match( line, /^`+/) ) { - len = RLENGTH - guard = substr( line, 1, len ) - if ( match(line, guard ".*" guard) ) { - code = substr( line, len + 1, match( substr(line, len + 1), guard ) - 1) - len = 2 * length(guard) + length(code) - # strip single surrounding white spaces - gsub( /^ | $/, "", code) - # escape HTML within code span - gsub( /&/, "\\&", code ); gsub( //, "\\>", code ); - return "" code "" inline( substr( line, len + 1 ) ) - } - - # Macros - } else if ( match( line, /^<<([^>]|>[^>])+>>/ ) ) { - len = RLENGTH; - return "" HTML( substr( line, 3, len - 4 ) ) "" inline(substr(line, len + 1)); - - # Wiki style links - } else if ( match( line, /^\[\[([^]|]+)(\|[^]]+)?\]\]/) ) { - len = RLENGTH; href = text = substr(line, 1, len); - sub(/^\[\[/, "", href); sub(/(\|([^]]+))?\]\].*$/, "", href); - sub(/^\[\[([^]|]+)/, "", text); sub(/\]\].*$/, "", text); sub(/^\|/, "", text); - # sub(/^\[\[([^]|]+)(\|([^]]+))?\]\]/, "\\1", href ); - # sub(/^\[\[([^]|]+)(\|([^]]+))?\]\]/, "\\3", text ); - if ( ! text ) text = href; - return "" HTML(text) "" inline( substr( line, len + 1) ); - - # quick links ("automatic links" in md doc) - } else if ( match( line, /^<[a-zA-Z]+:\/\/([-\.[:alnum:]]+)(:[0-9]*)?(\/[^>]*)?>/ ) ) { - len = RLENGTH; - href = HTML( substr( line, 2, len - 2) ); - return "" href "" inline( substr( line, len + 1) ); - - # quick link email - } else if ( match( line, /^<[a-zA-Z0-9.!#$%&'\''*+\/=?^_`{|}~-]+@[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>/ ) ) { - len = RLENGTH; - href = HTML( substr( line, 2, len - 2) ); - return "" href "" inline( substr( line, len + 1) ); - - # Verbatim inline HTML - } else if ( AllowHTML && match( line, /^(|<\?([^\?]|\?[^>])*\?>|]*>|])*\]\]>|<\/[A-Za-z][A-Za-z0-9-]*[[:space:]]*>|<[A-Za-z][A-Za-z0-9-]*([[:space:]]+[A-Za-z_:][A-Za-z0-9_\.:-]*([[:space:]]*=[[:space:]]*([[:space:]"'=<>`]+|"[^"]*"|'[^']*'))?)*[[:space:]]*\/?>)/) ) { - len = RLENGTH; - return substr( line, 1, len) inline(substr(line, len + 1)); - - # inline links - } else if ( match(line, "^" lii "\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)") ) { - len = RLENGTH; - text = href = title = substr( line, 1, len); - sub("^\\[", "", text); sub("\\]\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)$", "", text); - sub("^" lii "\\([\n\t ]*", "", href); sub("([\n\t ]+" lit ")?[\n\t ]*\\)$", "", href); - sub("^" lii "\\([\n\t ]*" lid, "", title); sub("[\n\t ]*\\)$", "", title); sub("^[\n\t ]+", "", title); - - if ( match(href, /^<.*>$/) ) { sub(/^$/, "", href); } - if ( match(title, /^".*"$/) ) { sub(/^"/, "", title); sub(/"$/, "", title); } - else if ( match(title, /^'.*'$/) ) { sub(/^'/, "", title); sub(/'$/, "", title); } - else if ( match(title, /^\(.*\)$/) ) { sub(/^\(/, "", title); sub(/\)$/, "", title); } - - gsub(/\\/, "", href); gsub(/\\/, "", title); gsub(/[\n\t]+/, " ", title); - - return "" \ - inline( text ) "" inline( substr( line, len + 1) ); - - # reference style links - } else if ( match(line, /^\[([^]]+)\] ?\[([^]]*)\]/ ) ) { - len = RLENGTH; text = id = substr(line, 1, len); - sub(/\n.*$/, "", text); sub(/^\[/, "", text); sub(/\] ?\[([^\n]*)\].*$/, "", text); - sub(/\n.*$/, "", id); sub(/^\[([^]]+)\] ?\[/, "", id); sub(/\].*$/, "", id); - # text = gensub(/^\[([^\n]+)\] ?\[([^\n]*)\].*/, "\\1", 1, text ); - # id = gensub(/^\[([^\n]+)\] ?\[([^\n]*)\].*/, "\\2", 1, id ); - if ( ! id ) id = text; - if ( rl_href[id] && rl_title[id] ) { - return "" inline(text) "" inline( substr( line, len + 1) ); - } else if ( rl_href[id] ) { - return "" inline(text) "" inline( substr( line, len + 1) ); - } else { - return "" HTML(substr(line, 1, len)) inline( substr(line, len + 1) ); - } - - # inline images - } else if ( match(line, "^!" lix "\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?") ) { - len = RLENGTH; text = href = title = attrib = substr( line, 1, len); - - sub("^!\\[", "", text); - sub("\\]\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?$", "", text); - - sub("^!" lix "\\([\n\t ]*", "", href); - sub("([\n\t ]+" lit ")?[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?$", "", href); - - sub("^!" lix "\\([\n\t ]*" lid, "", title); - sub("[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?$", "", title); - sub("^[\n\t ]+", "", title); +function inline( line, LOCAL, len, text, code, href, guard, ret ) { + ret = ""; + while (line !~ /^$/) { + # omit processing of escaped characters + if ( line ~ /^\\./) { + ret = ret HTML(substr(line, 2, 1)); line = substr(line, 3); + continue; + + # hard brakes + } else if ( match(line, /^ \n/) ) { + ret = ret "
    \n"; line = substr(line, RLENGTH + 1); + continue; + + # ``code spans`` + } else if ( match( line, /^`+/) ) { + len = RLENGTH + guard = substr( line, 1, len ) + if ( match(line, guard ".*" guard) ) { + code = substr( line, len + 1, match( substr(line, len + 1), guard ) - 1) + len = 2 * length(guard) + length(code) + # strip single surrounding white spaces + gsub( /^ | $/, "", code) + # escape HTML within code span + gsub( /&/, "\\&", code ); gsub( //, "\\>", code ); + ret = ret "" code ""; line = substr( line, len + 1 ); + continue; + } - sub("^!" lix "\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)", "", attrib); - sub(/^\{[ \t]*/, "", attrib); sub(/[ \t]*\}$/, "", attrib); gsub(/[ \t]+/, " ", attrib); + # Macros + } else if ( match( line, /^<<([^>]|>[^>])+>>/ ) ) { + len = RLENGTH; + ret = ret "" HTML( substr( line, 3, len - 4 ) ) ""; line = substr(line, len + 1); + continue; + + # Wiki style links + } else if ( match( line, /^\[\[([^]|]+)(\|[^]]+)?\]\]/) ) { + len = RLENGTH; href = text = substr(line, 1, len); + sub(/^\[\[/, "", href); sub(/(\|([^]]+))?\]\].*$/, "", href); + sub(/^\[\[([^]|]+)/, "", text); sub(/\]\].*$/, "", text); sub(/^\|/, "", text); + # sub(/^\[\[([^]|]+)(\|([^]]+))?\]\]/, "\\1", href ); + # sub(/^\[\[([^]|]+)(\|([^]]+))?\]\]/, "\\3", text ); + if ( ! text ) text = href; + ret = ret "" HTML(text) ""; line = substr( line, len + 1); + continue; + + # quick links ("automatic links" in md doc) + } else if ( match( line, /^<[a-zA-Z]+:\/\/([-\.[:alnum:]]+)(:[0-9]*)?(\/[^>]*)?>/ ) ) { + len = RLENGTH; + href = HTML( substr( line, 2, len - 2) ); + ret = ret "" href ""; line = substr( line, len + 1); + continue; + + # quick link email + # } else if ( match( line, /^<[a-zA-Z0-9.!#$%&'\''*+\/=?^_`{|}~-]+@[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>/ ) ) { + } else if ( match( line, /^<[a-zA-Z0-9.!#$%&'\''*+\/=?^_`{|}~-]+@[a-zA-Z0-9]([a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9])?)*>/ ) ) { + len = RLENGTH; + href = HTML( substr( line, 2, len - 2) ); + ret = ret "" href ""; line = substr( line, len + 1); + continue; + + # Verbatim inline HTML + } else if ( AllowHTML && match( line, /^(|<\?([^\?]|\?[^>])*\?>|]*>|])*\]\]>|<\/[A-Za-z][A-Za-z0-9-]*[[:space:]]*>|<[A-Za-z][A-Za-z0-9-]*([[:space:]]+[A-Za-z_:][A-Za-z0-9_\.:-]*([[:space:]]*=[[:space:]]*([[:space:]"'=<>`]+|"[^"]*"|'[^']*'))?)*[[:space:]]*\/?>)/) ) { + len = RLENGTH; + ret = ret substr( line, 1, len); line =substr(line, len + 1); + continue; + + # inline links + } else if ( match(line, "^" lii "\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)") ) { + len = RLENGTH; + text = href = title = substr( line, 1, len); + sub("^\\[", "", text); sub("\\]\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)$", "", text); + sub("^" lii "\\([\n\t ]*", "", href); sub("([\n\t ]+" lit ")?[\n\t ]*\\)$", "", href); + sub("^" lii "\\([\n\t ]*" lid, "", title); sub("[\n\t ]*\\)$", "", title); sub("^[\n\t ]+", "", title); + + if ( match(href, /^<.*>$/) ) { sub(/^$/, "", href); } + if ( match(title, /^".*"$/) ) { sub(/^"/, "", title); sub(/"$/, "", title); } + else if ( match(title, /^'.*'$/) ) { sub(/^'/, "", title); sub(/'$/, "", title); } + else if ( match(title, /^\(.*\)$/) ) { sub(/^\(/, "", title); sub(/\)$/, "", title); } + + gsub(/\\/, "", href); gsub(/\\/, "", title); gsub(/[\n\t]+/, " ", title); + + ret = ret "" \ + inline( text ) ""; + line = substr( line, len + 1); + continue; + + # reference style links + } else if ( match(line, /^\[([^]]+)\] ?\[([^]]*)\]/ ) ) { + len = RLENGTH; text = id = substr(line, 1, len); + sub(/\n.*$/, "", text); sub(/^\[/, "", text); sub(/\] ?\[([^\n]*)\].*$/, "", text); + sub(/\n.*$/, "", id); sub(/^\[([^]]+)\] ?\[/, "", id); sub(/\].*$/, "", id); + # text = gensub(/^\[([^\n]+)\] ?\[([^\n]*)\].*/, "\\1", 1, text ); + # id = gensub(/^\[([^\n]+)\] ?\[([^\n]*)\].*/, "\\2", 1, id ); + if ( ! id ) id = text; + + if ( rl_href[id] && rl_title[id] ) { + ret = ret "" inline(text) ""; + line = substr( line, len + 1); + continue; + + } else if ( rl_href[id] ) { + ret = ret "" inline(text) ""; line = substr( line, len + 1); + continue; + + } else { + ret = ret "" HTML(substr(line, 1, len)); line = substr(line, len + 1); + continue; + } - if ( match(href, /^<.*>$/) ) { sub(/^$/, "", href); } - if ( match(title, /^".*"$/) ) { sub(/^"/, "", title); sub(/"$/, "", title); } - else if ( match(title, /^'.*'$/) ) { sub(/^'/, "", title); sub(/'$/, "", title); } - else if ( match(title, /^\(.*\)$/) ) { sub(/^\(/, "", title); sub(/\)$/, "", title); } + # inline images + } else if ( match(line, "^!" lix "\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?") ) { + len = RLENGTH; text = href = title = attrib = substr( line, 1, len); + + sub("^!\\[", "", text); + sub("\\]\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?$", "", text); + + sub("^!" lix "\\([\n\t ]*", "", href); + sub("([\n\t ]+" lit ")?[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?$", "", href); + + sub("^!" lix "\\([\n\t ]*" lid, "", title); + sub("[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?$", "", title); + sub("^[\n\t ]+", "", title); + + sub("^!" lix "\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)", "", attrib); + sub(/^\{[ \t]*/, "", attrib); sub(/[ \t]*\}$/, "", attrib); gsub(/[ \t]+/, " ", attrib); + + if ( match(href, /^<.*>$/) ) { sub(/^$/, "", href); } + if ( match(title, /^".*"$/) ) { sub(/^"/, "", title); sub(/"$/, "", title); } + else if ( match(title, /^'.*'$/) ) { sub(/^'/, "", title); sub(/'$/, "", title); } + else if ( match(title, /^\(.*\)$/) ) { sub(/^\(/, "", title); sub(/\)$/, "", title); } + + gsub(/^[\t ]+$/, "", text); gsub(/\\/, "", href); + gsub(/\\/, "", title); gsub(/[\n\t]+/, " ", title); + + ret = ret "\"""; + line = substr( line, len + 1); + continue; + + # reference style images + } else if ( match(line, /^!\[([^]]*)\] ?\[([^]]*)\]/ ) ) { + len = RLENGTH; text = id = substr(line, 1, len); + sub(/\n.*$/, "", text); sub(/^!\[/, "", text); sub(/\] ?\[([^\n]*)\].*$/, "", text); + sub(/\n.*$/, "", id); sub(/^!\[([^]]+)\] ?\[/, "", id); sub(/\].*$/, "", id); + # text = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\].*/, "\\1", 1, substr(line, 1, len) ); + # id = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\].*/, "\\2", 1, substr(line, 1, len) ); + if ( ! id ) id = text; + if ( rl_href[id] && rl_title[id] ) { + ret = ret "\"""; + line = substr( line, len + 1); + continue; + + } else if ( rl_href[id] ) { + ret = ret "\"""; + line = substr( line, len + 1); + continue; + + } else { + ret = ret "" HTML(substr(line, 1, len)); line = substr(line, len + 1); + continue; + } - gsub(/^[\t ]+$/, "", text); gsub(/\\/, "", href); - gsub(/\\/, "", title); gsub(/[\n\t]+/, " ", title); - - return "\""" inline( substr( line, len + 1) ); - - # reference style images - } else if ( match(line, /^!\[([^]]*)\] ?\[([^]]*)\]/ ) ) { - len = RLENGTH; text = id = substr(line, 1, len); - sub(/\n.*$/, "", text); sub(/^!\[/, "", text); sub(/\] ?\[([^\n]*)\].*$/, "", text); - sub(/\n.*$/, "", id); sub(/^!\[([^]]+)\] ?\[/, "", id); sub(/\].*$/, "", id); - # text = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\].*/, "\\1", 1, substr(line, 1, len) ); - # id = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\].*/, "\\2", 1, substr(line, 1, len) ); - if ( ! id ) id = text; - if ( rl_href[id] && rl_title[id] ) { - return "\""" \ - inline( substr( line, len + 1) ); - } else if ( rl_href[id] ) { - return "\""" \ - inline( substr( line, len + 1) ); + # ~~strikeout~~ (pandoc) + } else if ( match(line, /^~~([[:graph:]]|[[:graph:]]([^~]|~[^~])*[[:graph:]])~~/) ) { + len = RLENGTH; + ret = ret "" inline( substr( line, 3, len - 4 ) ) ""; line = substr( line, len + 1 ); + continue; + + # ^superscript^ (pandoc) + } else if ( match(line, /^\^([^[:space:]^]|\\[ ^])+\^/) ) { + len = RLENGTH; + ret = ret "" inline( substr( line, 2, len - 2 ) ) ""; line = substr( line, len + 1 ); + continue; + + # ~subscript~ (pandoc) + } else if ( match(line, /^~([^[:space:]~]|\\[ ~])+~/) ) { + len = RLENGTH; + ret = ret "" inline( substr( line, 2, len - 2 ) ) ""; line = substr( line, len + 1 ); + continue; + + # ignore embedded underscores (pandoc, php md) + } else if ( match(line, "^[[:alnum:]](__|_)") ) { + ret = ret HTML(substr( line, 1, RLENGTH)); line = substr(line, RLENGTH + 1); + continue; + + # __strong__$ + } else if ( match(line, "^__(([^_[:space:]]|" ieu ")|([^_[:space:]]|" ieu ")(" nu "|" ieu ")*([^_[:space:]]|" ieu "))__$") ) { + len = RLENGTH; + ret = ret "" inline( substr( line, 3, len - 4 ) ) ""; line = substr( line, len + 1 ); + continue; + + # __strong__ + } else if ( match(line, "^__(([^_[:space:]]|" ieu ")|([^_[:space:]]|" ieu ")(" nu "|" ieu ")*([^_[:space:]]|" ieu "))__[[:space:][:punct:]]") ) { + len = RLENGTH; + ret = ret "" inline( substr( line, 3, len - 5 ) ) ""; line = substr( line, len); + continue; + + # **strong** + } else if ( match(line, "^\\*\\*(([^\\*[:space:]]|" iea ")|([^\\*[:space:]]|" iea ")(" na "|" iea ")*([^\\*[:space:]]|" iea "))\\*\\*") ) { + len = RLENGTH; + ret = ret "" inline( substr( line, 3, len - 4 ) ) ""; line = substr( line, len + 1 ); + continue; + + # _em_$ + } else if ( match(line, "^_(([^_[:space:]]|" isu ")|([^_[:space:]]|" isu ")(" nu "|" isu ")*([^_[:space:]]|" isu "))_$") ) { + len = RLENGTH; + ret = ret "" inline( substr( line, 2, len - 2 ) ) ""; line = substr( line, len + 1 ); + continue; + + # _em_ + } else if ( match(line, "^_(([^_[:space:]]|" isu ")|([^_[:space:]]|" isu ")(" nu "|" isu ")*([^_[:space:]]|" isu "))_[[:space:][:punct:]]") ) { + len = RLENGTH; + ret = ret "" inline( substr( line, 2, len - 3 ) ) ""; line = substr( line, len ); + continue; + + # *em* + } else if ( match(line, "^\\*(([^\\*[:space:]]|" isa ")|([^\\*[:space:]]|" isa ")(" na "|" isa ")*([^\\*[:space:]]|" isa "))\\*") ) { + len = RLENGTH; + ret = ret "" inline( substr( line, 2, len - 2 ) ) ""; line = substr( line, len + 1 ); + continue; + + # Literal HTML entities + # } else if ( match( line, /^&([a-zA-Z]{2,32}|#[0-9]{1,7}|#[xX][0-9a-fA-F]{1,6});/) ) { + # mawk does not support repitition ranges + } else if ( match( line, /^&([a-zA-Z][a-zA-Z][a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?|#[0-9][0-9]?[0-9]?[0-9]?[0-9]?[0-9]?[0-9]?|#[xX][0-9a-fA-F][0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?);/) ) { + len = RLENGTH; + ret = ret substr( line, 1, len ); line = substr(line, len + 1); + continue; + + # Arrows + } else if ( line ~ /^-->( |$)/) { # ignore multidash-arrow + ret = ret "-->"; line = substr(line, 4); + continue; + } else if ( line ~ /^<-( |$)/) { + ret = ret "←"; line = substr(line, 3); + continue; + } else if ( line ~ /^->( |$)/) { + ret = ret "→"; line = substr(line, 3); + continue; + + # Escape lone HTML character + } else if ( match( line, /^[&<>"']/) ) { + ret = ret HTML(substr(line, 1, 1)); line = substr(line, 2); + continue; + + # continue walk over string } else { - return "" HTML(substr(line, 1, len)) inline( substr(line, len + 1) ); + ret = ret substr(line, 1, 1); line = substr(line, 2); + continue; } - - # ~~strikeout~~ (pandoc) - } else if ( match(line, /^~~([[:graph:]]|[[:graph:]]([^~]|~[^~])*[[:graph:]])~~/) ) { - len = RLENGTH; - return "" inline( substr( line, 3, len - 4 ) ) "" inline( substr( line, len + 1 ) ); - - # ^superscript^ (pandoc) - } else if ( match(line, /^\^([^[:space:]^]|\\[ ^])+\^/) ) { - len = RLENGTH; - return "" inline( substr( line, 2, len - 2 ) ) "" inline( substr( line, len + 1 ) ); - - # ~subscript~ (pandoc) - } else if ( match(line, /^~([^[:space:]~]|\\[ ~])+~/) ) { - len = RLENGTH; - return "" inline( substr( line, 2, len - 2 ) ) "" inline( substr( line, len + 1 ) ); - - # ignore embedded underscores (pandoc, php md) - } else if ( match(line, "^[[:alnum:]](__|_)") ) { - return HTML(substr( line, 1, RLENGTH)) inline( substr(line, RLENGTH + 1) ); - - # __strong__$ - } else if ( match(line, "^__(([^_[:space:]]|" ieu ")|([^_[:space:]]|" ieu ")(" nu "|" ieu ")*([^_[:space:]]|" ieu "))__$") ) { - len = RLENGTH; - return "" inline( substr( line, 3, len - 4 ) ) "" inline( substr( line, len + 1 ) ); - - # __strong__ - } else if ( match(line, "^__(([^_[:space:]]|" ieu ")|([^_[:space:]]|" ieu ")(" nu "|" ieu ")*([^_[:space:]]|" ieu "))__[[:space:][:punct:]]") ) { - len = RLENGTH; - return "" inline( substr( line, 3, len - 5 ) ) "" inline( substr( line, len) ); - - # **strong** - } else if ( match(line, "^\\*\\*(([^\\*[:space:]]|" iea ")|([^\\*[:space:]]|" iea ")(" na "|" iea ")*([^\\*[:space:]]|" iea "))\\*\\*") ) { - len = RLENGTH; - return "" inline( substr( line, 3, len - 4 ) ) "" inline( substr( line, len + 1 ) ); - - # _em_$ - } else if ( match(line, "^_(([^_[:space:]]|" isu ")|([^_[:space:]]|" isu ")(" nu "|" isu ")*([^_[:space:]]|" isu "))_$") ) { - len = RLENGTH; - return "" inline( substr( line, 2, len - 2 ) ) "" inline( substr( line, len + 1 ) ); - - # _em_ - } else if ( match(line, "^_(([^_[:space:]]|" isu ")|([^_[:space:]]|" isu ")(" nu "|" isu ")*([^_[:space:]]|" isu "))_[[:space:][:punct:]]") ) { - len = RLENGTH; - return "" inline( substr( line, 2, len - 3 ) ) "" inline( substr( line, len ) ); - - # *em* - } else if ( match(line, "^\\*(([^\\*[:space:]]|" isa ")|([^\\*[:space:]]|" isa ")(" na "|" isa ")*([^\\*[:space:]]|" isa "))\\*") ) { - len = RLENGTH; - return "" inline( substr( line, 2, len - 2 ) ) "" inline( substr( line, len + 1 ) ); - - # Literal HTML entities - # } else if ( match( line, /^&([a-zA-Z]{2,32}|#[0-9]{1,7}|#[xX][0-9a-fA-F]{1,6});/) ) { - # mawk does not support repitition ranges - } else if ( match( line, /^&([a-zA-Z][a-zA-Z][a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?|#[0-9][0-9]?[0-9]?[0-9]?[0-9]?[0-9]?[0-9]?|#[xX][0-9a-fA-F][0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?);/) ) { - len = RLENGTH; - return substr( line, 1, len ) inline(substr(line, len + 1)); - - # Arrows - } else if ( line ~ /^-->( |$)/) { # ignore multidash-arrow - return "-->" inline( substr(line, 4) ); - } else if ( line ~ /^<-( |$)/) { - return "←" inline( substr(line, 3) ); - } else if ( line ~ /^->( |$)/) { - return "→" inline( substr(line, 3) ); - - # Escape lone HTML character - } else if ( match( line, /^[&<>"']/) ) { - return HTML(substr(line, 1, 1)) inline(substr(line, 2)); - - # continue walk over string - } else { - return substr(line, 1, 1) inline( substr(line, 2) ); } + return ret; } function headline( hlvl, htxt, attrib, LOCAL, sec, n, HL) { @@ -364,14 +406,14 @@ function _nblock( block, LOCAL, sec, n ) { for ( n = blvl + 1; n in BL; n++) { delete BL[n]; } block = _block( block ); - match(hstack, /([0-9]+( [0-9]+){5})$/); split( substr(hstack, RSTART), HL); + match(hstack, /([0-9]+( [0-9]+)( [0-9]+)?( [0-9]+)?( [0-9]+)?( [0-9]+)?)$/); split( substr(hstack, RSTART), HL); sec = ""; for ( n = 1; n <= 6; n++ ) { sec = sec (HL[n]?"
    ":""); } - sub("( +[0-9]+){6} *$", "", hstack); blvl--; + sub("( +[0-9]+)( +[0-9]+)?( +[0-9]+)?( +[0-9]+)?( +[0-9]+)?( +[0-9]+)? *$", "", hstack); blvl--; return block sec; } -function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, indent, list ) { +function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, indent, list, tmp ) { gsub( "(^\n+|\n+$)", "", block ); if ( block == "" ) { -- 2.39.5 From b6f82bc119dd614862f61df3b1978cc5789ab31a Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Fri, 30 Aug 2024 16:30:00 +0200 Subject: [PATCH 07/16] avoid recursion in _block function to increase compatibility --- markdown.awk | 756 ++++++++++++++++++++++++++------------------------- 1 file changed, 393 insertions(+), 363 deletions(-) diff --git a/markdown.awk b/markdown.awk index d1c8b5b..fc2f203 100755 --- a/markdown.awk +++ b/markdown.awk @@ -413,408 +413,438 @@ function _nblock( block, LOCAL, sec, n ) { return block sec; } -function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, indent, list, tmp ) { - gsub( "(^\n+|\n+$)", "", block ); - - if ( block == "" ) { - return ""; - - # HTML #2 #3 #4 $5 - } else if ( AllowHTML && match( block, /(^|\n) ? ? ?(|$)|<\?([^\?]|\?[^>])*(\?>|$)|]*(>|$)|])*(\]\]>|$))/) ) { - len = RLENGTH; st = RSTART; - return _block(substr(block, 1, st - 1)) substr(block, st, len) _block(substr(block, st + len)); - - # HTML #6 - } else if ( AllowHTML && match( tolower(block), /(^|\n) ? ? ?<\/?(address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[123456]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)([[:space:]\n>]|\/>)([^\n]|\n[ \t]*[^\n])*(\n[[:space:]]*\n|$)/) ) { - len = RLENGTH; st = RSTART; - return _block(substr(block, 1, st - 1)) substr(block, st, len) _block(substr(block, st + len)); - - # HTML #1 - } else if ( AllowHTML && match( tolower(block), /(^|\n) ? ? ?<(script|pre|style)([[:space:]\n>]).*(<\/script>|<\/pre>|<\/style>|$)/) ) { - len = RLENGTH; st = RSTART; - match( tolower(substr(block, st, len)), /(<\/script>|<\/pre>|<\/style>)/); - len = RSTART + RLENGTH; - return _block(substr(block, 1, st - 1)) substr(block, st, len) _block(substr(block, st + len)); - - # HTML #7 - } else if ( AllowHTML && match( block, /^ ? ? ?(<\/[A-Za-z][A-Za-z0-9-]*[[:space:]]*>|<[A-Za-z][A-Za-z0-9-]*([[:space:]]+[A-Za-z_:][A-Za-z0-9_\.:-]*([[:space:]]*=[[:space:]]*([[:space:]"'=<>`]+|"[^"]*"|'[^']*'))?)*[[:space:]]*\/?>)([[:space:]]*\n)([^\n]|\n[ \t]*[^\n])*(\n[[:space:]]*\n|$)/) ) { - len = RLENGTH; st = RSTART; - return substr(block, st, len) _block(substr(block, st + len)); - - # Metadata (custom, block starting with %something) - # Metadata is ignored but can be interpreted externally - } else if ( match(block, /^%[a-zA-Z-]+([[:space:]][^\n]*)?(\n|$)(%[a-zA-Z-]+([[:space:]][^\n]*)?(\n|$)|%([[:space:]][^\n]*)?(\n|$)|[ \t]+[^\n[:space:]][^\n]*(\n|$))*/) ) { - len = RLENGTH; st = RSTART; - return _block( substr( block, len + 1) ); +function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, indent, list, tmp, ret) { + ret = ""; + while ( block != "" ) { + gsub( "(^\n+|\n+$)", "", block ); + + # HTML #2 #3 #4 $5 + if ( AllowHTML && match( block, /(^|\n) ? ? ?(|$)|<\?([^\?]|\?[^>])*(\?>|$)|]*(>|$)|])*(\]\]>|$))/) ) { + len = RLENGTH; st = RSTART; + ret = ret _block(substr(block, 1, st - 1)) substr(block, st, len); block = substr(block, st + len); + continue; + + # HTML #6 + } else if ( AllowHTML && match( tolower(block), /(^|\n) ? ? ?<\/?(address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[123456]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)([[:space:]\n>]|\/>)([^\n]|\n[ \t]*[^\n])*(\n[[:space:]]*\n|$)/) ) { + len = RLENGTH; st = RSTART; + ret = ret _block(substr(block, 1, st - 1)) substr(block, st, len); block = substr(block, st + len); + continue; + + # HTML #1 + } else if ( AllowHTML && match( tolower(block), /(^|\n) ? ? ?<(script|pre|style)([[:space:]\n>]).*(<\/script>|<\/pre>|<\/style>|$)/) ) { + len = RLENGTH; st = RSTART; + match( tolower(substr(block, st, len)), /(<\/script>|<\/pre>|<\/style>)/); + len = RSTART + RLENGTH; + ret = ret _block(substr(block, 1, st - 1)) substr(block, st, len); block = substr(block, st + len); + continue; + + # HTML #7 + } else if ( AllowHTML && match( block, /^ ? ? ?(<\/[A-Za-z][A-Za-z0-9-]*[[:space:]]*>|<[A-Za-z][A-Za-z0-9-]*([[:space:]]+[A-Za-z_:][A-Za-z0-9_\.:-]*([[:space:]]*=[[:space:]]*([[:space:]"'=<>`]+|"[^"]*"|'[^']*'))?)*[[:space:]]*\/?>)([[:space:]]*\n)([^\n]|\n[ \t]*[^\n])*(\n[[:space:]]*\n|$)/) ) { + len = RLENGTH; st = RSTART; + ret = ret substr(block, st, len); block = substr(block, st + len); + continue; + + # Metadata (custom, block starting with %something) + # Metadata is ignored but can be interpreted externally + } else if ( match(block, /^%[a-zA-Z-]+([[:space:]][^\n]*)?(\n|$)(%[a-zA-Z-]+([[:space:]][^\n]*)?(\n|$)|%([[:space:]][^\n]*)?(\n|$)|[ \t]+[^\n[:space:]][^\n]*(\n|$))*/) ) { + len = RLENGTH; st = RSTART; + block = substr( block, len + 1); + continue; - # Blockquote (leading >) - } else if ( match( block, /^> /) ) { - match( block, /(^|\n)[[:space:]]*(\n|$)/ ) || match(block, /$/); - len = RLENGTH; st = RSTART; - text = substr(block, 1, st - 1); gsub( /(^|\n)> /, "\n", text ); - text = _nblock( text ); gsub( /^\n|\n$/, "", text ) - return "
    " text "
    \n\n" _block( substr(block, st + len) ); - - # Pipe Tables (pandoc / php md / gfm ) - } else if ( match(block, "^((\\|)?([^\n]+\\|)+[^\n]+(\\|)?)\n" \ - "((\\|)?(:?-+:?[\\|+])+:?-+:?(\\|)?)\n" \ - "((\\|)?([^\n]+\\|)+[^\n]+(\\|)?(\n|$))+" ) ) { - len = RLENGTH; st = RSTART; - #initialize empty arrays - split("", talign); split("", tarray); - cols = 0; cnt=0; ttext = ""; - - # table header and alignment - tmp = substr(block, 1, match(block, /(\n|$)/)); - gsub( /(^|[^\\])\\\|/, "\\1\\|", tmp ); - gsub( /(^\||\|$)/, "", tmp) - split( tmp, tarray, /\|/); - block = substr(block, match(block, /(\n|$)/) + 1 ); - tmp = substr(block, 1, match(block, /(\n|$)/)); - gsub( /(^\||\|$)/, "", tmp ); - cols = split( tmp , talign, /[+\|]/); - block = substr(block, match(block, /(\n|$)/) + 1 ); - - for( cnt = 1; cnt < cols; cnt++ ) { - if (match(talign[cnt], /:-+:/)) talign[cnt]="center"; - else if (match(talign[cnt], /-+:/)) talign[cnt]="right"; - else if (match(talign[cnt], /:-+/)) talign[cnt]="left"; - else talign[cnt]=""; - } + # Blockquote (leading >) + } else if ( match( block, /^> /) ) { + match( block, /(^|\n)[[:space:]]*(\n|$)/ ) || match(block, /$/); + len = RLENGTH; st = RSTART; + text = substr(block, 1, st - 1); gsub( /(^|\n)> /, "\n", text ); + text = _nblock( text ); gsub( /^\n|\n$/, "", text ) + ret = ret "
    " text "
    \n\n"; block = substr(block, st + len); + continue; - ttext = "
    \n" - for (cnt = 1; cnt < cols; cnt++) - ttext = ttext "" - ttext = ttext "\n\n" + # Pipe Tables (pandoc / php md / gfm ) + } else if ( match(block, "^((\\|)?([^\n]+\\|)+[^\n]+(\\|)?)\n" \ + "((\\|)?(:?-+:?[\\|+])+:?-+:?(\\|)?)\n" \ + "((\\|)?([^\n]+\\|)+[^\n]+(\\|)?(\n|$))+" ) ) { + len = RLENGTH; st = RSTART; + #initialize empty arrays + split("", talign); split("", tarray); + cols = 0; cnt=0; ttext = ""; - while ( match(block, "^((\\|)?([^\n]+\\|)+[^\n]+(\\|)?(\n|$))+" ) ){ + # table header and alignment tmp = substr(block, 1, match(block, /(\n|$)/)); gsub( /(^|[^\\])\\\|/, "\\1\\|", tmp ); - gsub( /(^\||\|$)/, "", tmp ); + gsub( /(^\||\|$)/, "", tmp) split( tmp, tarray, /\|/); block = substr(block, match(block, /(\n|$)/) + 1 ); + tmp = substr(block, 1, match(block, /(\n|$)/)); + gsub( /(^\||\|$)/, "", tmp ); + cols = split( tmp , talign, /[+\|]/); + block = substr(block, match(block, /(\n|$)/) + 1 ); + + for( cnt = 1; cnt < cols; cnt++ ) { + if (match(talign[cnt], /:-+:/)) talign[cnt]="center"; + else if (match(talign[cnt], /-+:/)) talign[cnt]="right"; + else if (match(talign[cnt], /:-+/)) talign[cnt]="left"; + else talign[cnt]=""; + } - ttext = ttext "" + ttext = "\n" for (cnt = 1; cnt < cols; cnt++) - ttext = ttext "" - ttext = ttext "\n" - } - return "

    Col 1

    -

    Col 2

    +

    Col 1\

    +

    Col|2

    Col 3

    " inline(tarray[cnt]) "
    " inline(tarray[cnt]) "
    " ttext "
    \n" _block(block); - - # Grid Tables (pandoc) - # (with, and without header) - } else if ( match( block, "^\\+(-+\\+)+\n" \ - "(\\|([^\n]+\\|)+\n)+" \ - "(\\+(:?=+:?\\+)+)\n" \ - "((\\|([^\n]+\\|)+\n)+" \ - "\\+(-+\\+)+(\n|$))+" \ - ) || \ - match( block, "^()()()" \ - "(\\+(:?-+:?\\+)+)\n" \ - "((\\|([^\n]+\\|)+\n)+" \ - "\\+(-+\\+)+(\n|$))+" \ - ) ) { - len = RLENGTH; st = RSTART; - #initialize empty arrays - split("", talign); split("", tarray); split("", tread); - cols = 0; cnt=0; ttext = ""; - - # Column Count - tmp = block; sub( "(\n.*)*$", "", tmp); - cols = split( tmp, tread, /\+/) - 2; - # debug(" Cols: " gensub( "^(\\+(:?-+:?\\+)+)(\n.*)*$", "\\1", 1, block )); - - # table alignment - match(block, "((:?=+:?\\+|(:-+|-+:|:-+:)\\+)+)"); - split( substr(block, RSTART, RLENGTH) , talign, /\+/ ); - # split( gensub( "^(.*\n)?\\+((:?=+:?\\+|(:-+|-+:|:-+:)\\+)+)(\n.*)$", "\\2", "g", block ), talign, /\+/ ); - # debug("Align: " gensub( "^(.*\n)?\\+((:?=+:?\\+|(:-+|-+:|:-+:)\\+)+)(\n.*)$", "\\2", "g", block )); - - for (cnt = 1; cnt <= cols; cnt++) { - if (match(talign[cnt], /:(-+|=+):/)) talign[cnt]="center"; - else if (match(talign[cnt], /(-+|=+):/)) talign[cnt]="right"; - else if (match(talign[cnt], /:(-+|=+)/ )) talign[cnt]="left"; - else talign[cnt]=""; - } + ttext = ttext "" inline(tarray[cnt]) "" + ttext = ttext "\n\n" - if ( match(block, "^\\+(-+\\+)+\n" \ - "(\\|([^\n]+\\|)+\n)+" \ - "\\+(:?=+:?\\+)+\n" \ - "((\\|([^\n]+\\|)+\n)+" \ - "\\+(-+\\+)+(\n|$))+" \ - ) ) { - # table header - block = substr(block, match(block, /(\n|$)/) + 1 ); - while ( match(block, "^\\|([^\n]+\\|)+\n") ) { + while ( match(block, "^((\\|)?([^\n]+\\|)+[^\n]+(\\|)?(\n|$))+" ) ){ tmp = substr(block, 1, match(block, /(\n|$)/)); - gsub( /\\\\/, "\\\", tmp); gsub(/\\\|/, "\\|", tmp); + gsub( /(^|[^\\])\\\|/, "\\1\\|", tmp ); gsub( /(^\||\|$)/, "", tmp ); - split(tmp, tread, /\|/); + split( tmp, tarray, /\|/); block = substr(block, match(block, /(\n|$)/) + 1 ); - for (cnt = 1; cnt <= cols; cnt++) - tarray[cnt] = tarray[cnt] "\n" tread[cnt]; - } - ttext = "\n" - for (cnt = 1; cnt <= cols; cnt++) - ttext = ttext "" _nblock(tarray[cnt]) "" - ttext = ttext "\n" - } + ttext = ttext "" + for (cnt = 1; cnt < cols; cnt++) + ttext = ttext "" inline(tarray[cnt]) "" + ttext = ttext "\n" + } + ret = ret "" ttext "
    \n"; + continue; - # table body - block = substr(block, match(block, /(\n|$)/) + 1 ); - ttext = ttext "\n" + # Grid Tables (pandoc) + # (with, and without header) + } else if ( match( block, "^\\+(-+\\+)+\n" \ + "(\\|([^\n]+\\|)+\n)+" \ + "(\\+(:?=+:?\\+)+)\n" \ + "((\\|([^\n]+\\|)+\n)+" \ + "\\+(-+\\+)+(\n|$))+" \ + ) || \ + match( block, "^()()()" \ + "(\\+(:?-+:?\\+)+)\n" \ + "((\\|([^\n]+\\|)+\n)+" \ + "\\+(-+\\+)+(\n|$))+" \ + ) ) { + len = RLENGTH; st = RSTART; + #initialize empty arrays + split("", talign); split("", tarray); split("", tread); + cols = 0; cnt=0; ttext = ""; + + # Column Count + tmp = block; sub( "(\n.*)*$", "", tmp); + cols = split( tmp, tread, /\+/) - 2; + # debug(" Cols: " gensub( "^(\\+(:?-+:?\\+)+)(\n.*)*$", "\\1", 1, block )); + + # table alignment + match(block, "((:?=+:?\\+|(:-+|-+:|:-+:)\\+)+)"); + split( substr(block, RSTART, RLENGTH) , talign, /\+/ ); + # split( gensub( "^(.*\n)?\\+((:?=+:?\\+|(:-+|-+:|:-+:)\\+)+)(\n.*)$", "\\2", "g", block ), talign, /\+/ ); + # debug("Align: " gensub( "^(.*\n)?\\+((:?=+:?\\+|(:-+|-+:|:-+:)\\+)+)(\n.*)$", "\\2", "g", block )); + + for (cnt = 1; cnt <= cols; cnt++) { + if (match(talign[cnt], /:(-+|=+):/)) talign[cnt]="center"; + else if (match(talign[cnt], /(-+|=+):/)) talign[cnt]="right"; + else if (match(talign[cnt], /:(-+|=+)/ )) talign[cnt]="left"; + else talign[cnt]=""; + } - while ( match(block, /^((\|([^\n]+\|)+\n)+\+(-+\+)+(\n|$))+/ ) ){ - split("", tarray); - while ( match(block, /^\|([^\n]+\|)+\n/) ) { - tmp = substr(block, 1, match(block, /(\n|$)/)); - gsub( /\\\\/, "\\\", tmp); gsub(/\\\|/, "\\|", tmp); - gsub( /(^\||\|$)/, "", tmp); - split( tmp, tread, /\|/); + if ( match(block, "^\\+(-+\\+)+\n" \ + "(\\|([^\n]+\\|)+\n)+" \ + "\\+(:?=+:?\\+)+\n" \ + "((\\|([^\n]+\\|)+\n)+" \ + "\\+(-+\\+)+(\n|$))+" \ + ) ) { + # table header block = substr(block, match(block, /(\n|$)/) + 1 ); + while ( match(block, "^\\|([^\n]+\\|)+\n") ) { + tmp = substr(block, 1, match(block, /(\n|$)/)); + gsub( /\\\\/, "\\\", tmp); gsub(/\\\|/, "\\|", tmp); + gsub( /(^\||\|$)/, "", tmp ); + split(tmp, tread, /\|/); + block = substr(block, match(block, /(\n|$)/) + 1 ); + for (cnt = 1; cnt <= cols; cnt++) + tarray[cnt] = tarray[cnt] "\n" tread[cnt]; + } + + ttext = "\n" for (cnt = 1; cnt <= cols; cnt++) - tarray[cnt] = tarray[cnt] "\n" tread[cnt]; + ttext = ttext "" _nblock(tarray[cnt]) "" + ttext = ttext "\n" } + + # table body block = substr(block, match(block, /(\n|$)/) + 1 ); + ttext = ttext "\n" + + while ( match(block, /^((\|([^\n]+\|)+\n)+\+(-+\+)+(\n|$))+/ ) ){ + split("", tarray); + while ( match(block, /^\|([^\n]+\|)+\n/) ) { + tmp = substr(block, 1, match(block, /(\n|$)/)); + gsub( /\\\\/, "\\\", tmp); gsub(/\\\|/, "\\|", tmp); + gsub( /(^\||\|$)/, "", tmp); + split( tmp, tread, /\|/); + block = substr(block, match(block, /(\n|$)/) + 1 ); + for (cnt = 1; cnt <= cols; cnt++) + tarray[cnt] = tarray[cnt] "\n" tread[cnt]; + } + block = substr(block, match(block, /(\n|$)/) + 1 ); - ttext = ttext "" - for (cnt = 1; cnt <= cols; cnt++) - ttext = ttext "" _nblock(tarray[cnt]) "" - ttext = ttext "\n" - } - return "" ttext "
    \n" _nblock(block); - - # Line Blocks (pandoc) - } else if ( match(block, /^\| [^\n]*(\n|$)(\| [^\n]*(\n|$)|[ \t]+[^\n[:space:]][^\n]*(\n|$))*/) ) { - len = RLENGTH; st = RSTART; - - text = substr(block, 1, len); gsub(/\n[[:space:]]+/, " ", text); - gsub(/\n\| /, "\n", text); gsub(/^\| |\n$/, "", text); - text = inline(text); gsub(/\n/, "
    \n", text); - - return "
    " text "
    \n" _block( substr( block, len + 1) ); - - # Indented Code Block - } else if ( match(block, /^(( |\t)[^\n]*[^\n\t ][^\n]*(\n|$))(( |\t)[^\n]*(\n|$)|[\t ]*(\n|$))*/) ) { - len = RLENGTH; st = RSTART; - - code = substr(block, 1, len); - gsub(/(^|\n)( |\t)/, "\n", code); - gsub(/^\n|\n+$/, "", code); - return "
    " HTML( code ) "
    \n" \ - _block( substr( block, len + 1 ) ); - - # Fenced Divs (pandoc, custom) - } else if ( match( block, /^(:::+)/ ) ) { - guard = substr( block, 1, RLENGTH ); attrib = code = block; - sub(/^[^\n]+\n/, "", code); - sub(/^:::+[ \t]*\{?[ \t]*/, "", attrib); sub(/\}?[ \t]*\n.*$/, "", attrib); - # attrib = gensub(/^:::+[ \t]*\{?[ \t]*([^\}\n]*)\}?[ \t]*\n.*$/, "\\1", 1, attrib); - gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); - gsub(/(^ | $)/, "", attrib); - if ( match(code, "(^|\n)" guard "+(\n|$)" ) && attrib ) { - len = RLENGTH; st = RSTART; - return "
    " _nblock( substr(code, 1, st - 1) ) "
    \n" \ - _block( substr( code, st + len ) ); - } else if ( match(code, "(^|\n)" guard "+(\n|$)" ) ) { - len = RLENGTH; st = RSTART; - return "
    " _nblock( substr(code, 1, st - 1) ) "
    \n" \ - _block( substr( code, st + len ) ); - } else { - match( block, /(^|\n)[[:space:]]*(\n|$)/ ) || match( block, /$/ ); - len = RLENGTH; st = RSTART; - return "

    " inline( substr(block, 1, st - 1) ) "

    \n" \ - _block( substr(block, st + len) ); - } + ttext = ttext "" + for (cnt = 1; cnt <= cols; cnt++) + ttext = ttext "" _nblock(tarray[cnt]) "" + ttext = ttext "\n" + } + return ret "" ttext "
    \n" _nblock(block); - # Fenced Code Block (pandoc) - } else if ( match( block, /^(~~~+|```+)/ ) ) { - guard = substr( block, 1, RLENGTH ); attrib = code = block; - sub(/^[^\n]+\n/, "", code); - sub(/^(~~~+|```+)[ \t]*\{?[ \t]*/, "", attrib); sub(/\}?[ \t]*\n.*$/, "", attrib); - # attrib = gensub(/^(~~~+|```+)[ \t]*\{?[ \t]*([^\}\n]*)\}?[ \t]*\n.*$/, "\\2", 1, attrib); - gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); - gsub(/(^ | $)/, "", attrib); - if ( match(code, "(^|\n)" guard "+(\n|$)" ) && attrib ) { + # Line Blocks (pandoc) + } else if ( match(block, /^\| [^\n]*(\n|$)(\| [^\n]*(\n|$)|[ \t]+[^\n[:space:]][^\n]*(\n|$))*/) ) { len = RLENGTH; st = RSTART; - return "
    " HTML( substr(code, 1, st - 1) ) "
    \n" \ - _block( substr( code, st + len ) ); - } else if ( match(code, "(^|\n)" guard "+(\n|$)" ) ) { - len = RLENGTH; st = RSTART; - return "
    " HTML( substr(code, 1, st - 1) ) "
    \n" \ - _block( substr( code, st + len ) ); - } else { - match( block, /(^|\n)[[:space:]]*(\n|$)/ ) || match( block, /$/ ); + + text = substr(block, 1, len); gsub(/\n[[:space:]]+/, " ", text); + gsub(/\n\| /, "\n", text); gsub(/^\| |\n$/, "", text); + text = inline(text); gsub(/\n/, "
    \n", text); + + ret = ret "
    " text "
    \n"; block = substr( block, len + 1); + continue; + + # Indented Code Block + } else if ( match(block, /^(( |\t)[^\n]*[^\n\t ][^\n]*(\n|$))(( |\t)[^\n]*(\n|$)|[\t ]*(\n|$))*/) ) { len = RLENGTH; st = RSTART; - return "

    " inline( substr(block, 1, st - 1) ) "

    \n" \ - _block( substr(block, st + len) ); - } - # First Order Heading H1 + Attrib - } else if ( match( block, /^([^\n]+)([ \t]*\{([^\}\n]+)\})\n===+(\n|$)/ ) ) { - len = RLENGTH; text = attrib = block; - sub(/([ \t]*\{([^\}\n]+)\})\n===+(\n.*)?$/, "", text); - sub(/\}\n===+(\n.*)?$/, "", attrib); sub(/^([^\n]+)[ \t]*\{/, "", attrib); - gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); + code = substr(block, 1, len); + gsub(/(^|\n)( |\t)/, "\n", code); + gsub(/^\n|\n+$/, "", code); + ret = ret "
    " HTML( code ) "
    \n"; block = substr( block, len + 1 ); + continue; - return headline(1, text, attrib) _block( substr( block, len + 1 ) ); + # Fenced Divs (pandoc, custom) + } else if ( match( block, /^(:::+)/ ) ) { + guard = substr( block, 1, RLENGTH ); attrib = code = block; + sub(/^[^\n]+\n/, "", code); + sub(/^:::+[ \t]*\{?[ \t]*/, "", attrib); sub(/\}?[ \t]*\n.*$/, "", attrib); + # attrib = gensub(/^:::+[ \t]*\{?[ \t]*([^\}\n]*)\}?[ \t]*\n.*$/, "\\1", 1, attrib); + gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); + gsub(/(^ | $)/, "", attrib); + if ( match(code, "(^|\n)" guard "+(\n|$)" ) && attrib ) { + len = RLENGTH; st = RSTART; + ret = ret "
    " _nblock( substr(code, 1, st - 1) ) "
    \n"; + block = substr( code, st + len ); + continue; - # First Order Heading H1 - } else if ( match( block, /^([^\n]+)\n===+(\n|$)/ ) ) { - len = RLENGTH; text = substr(block, 1, len); - sub(/\n===+(\n.*)?$/, "", text); + } else if ( match(code, "(^|\n)" guard "+(\n|$)" ) ) { + len = RLENGTH; st = RSTART; + ret = ret "
    " _nblock( substr(code, 1, st - 1) ) "
    \n"; block = substr( code, st + len ); + continue; + + } else { + match( block, /(^|\n)[[:space:]]*(\n|$)/ ) || match( block, /$/ ); + len = RLENGTH; st = RSTART; + ret = ret "

    " inline( substr(block, 1, st - 1) ) "

    \n"; block = substr(block, st + len); + continue; + } - return headline(1, text, 0) _block( substr( block, len + 1 ) ); + # Fenced Code Block (pandoc) + } else if ( match( block, /^(~~~+|```+)/ ) ) { + guard = substr( block, 1, RLENGTH ); attrib = code = block; + sub(/^[^\n]+\n/, "", code); + sub(/^(~~~+|```+)[ \t]*\{?[ \t]*/, "", attrib); sub(/\}?[ \t]*\n.*$/, "", attrib); + # attrib = gensub(/^(~~~+|```+)[ \t]*\{?[ \t]*([^\}\n]*)\}?[ \t]*\n.*$/, "\\2", 1, attrib); + gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); + gsub(/(^ | $)/, "", attrib); + if ( match(code, "(^|\n)" guard "+(\n|$)" ) && attrib ) { + len = RLENGTH; st = RSTART; + ret = ret "
    " HTML( substr(code, 1, st - 1) ) "
    \n"; + block = substr( code, st + len ); + continue; - # Second Order Heading H2 + Attrib - } else if ( match( block, /^([^\n]+)([ \t]*\{([^\}\n]+)\})\n---+(\n|$)/ ) ) { - len = RLENGTH; text = attrib = block; - sub(/([ \t]*\{([^\}\n]+)\})\n---+(\n.*)?$/, "", text); - sub(/\}\n---+(\n.*)?$/, "", attrib); sub(/^([^\n]+)[ \t]*\{/, "", attrib); - gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); + } else if ( match(code, "(^|\n)" guard "+(\n|$)" ) ) { + len = RLENGTH; st = RSTART; + ret = ret "
    " HTML( substr(code, 1, st - 1) ) "
    \n"; + block = substr( code, st + len ); + continue; - return headline(2, text, attrib) _block( substr( block, len + 1) ); + } else { + match( block, /(^|\n)[[:space:]]*(\n|$)/ ) || match( block, /$/ ); + len = RLENGTH; st = RSTART; + ret = ret "

    " inline( substr(block, 1, st - 1) ) "

    \n"; block = substr(block, st + len); + continue; + } - # Second Order Heading H2 - } else if ( match( block, /^([^\n]+)\n---+(\n|$)/ ) ) { - len = RLENGTH; text = substr(block, 1, len); - sub(/\n---+(\n.*)?$/, "", text); + # First Order Heading H1 + Attrib + } else if ( match( block, /^([^\n]+)([ \t]*\{([^\}\n]+)\})\n===+(\n|$)/ ) ) { + len = RLENGTH; text = attrib = block; + sub(/([ \t]*\{([^\}\n]+)\})\n===+(\n.*)?$/, "", text); + sub(/\}\n===+(\n.*)?$/, "", attrib); sub(/^([^\n]+)[ \t]*\{/, "", attrib); + gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); - return headline(2, text, 0) _block( substr( block, len + 1) ); + ret = ret headline(1, text, attrib) ; block = substr( block, len + 1 ); + continue; - # Nth Order Heading H1 H2 H3 H4 H5 H6 + Attrib - } else if ( match( block, /^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*([ \t]*\{([a-zA-Z \t-]*)\})(\n|$)/ ) ) { - len = RLENGTH; text = attrib = substr(block, 1, len); - match(block, /^##?#?#?#?#?[^#]/); n = RLENGTH - 1; + # First Order Heading H1 + } else if ( match( block, /^([^\n]+)\n===+(\n|$)/ ) ) { + len = RLENGTH; text = substr(block, 1, len); + sub(/\n===+(\n.*)?$/, "", text); - # sub(/^(##?#?#?#?#?)[ \t]*/, "", text); # not working in mawk - text = substr(text, n + 1); sub(/^[ \t]*/, "", text); - sub(/[ \t]*#*([ \t]*\{([a-zA-Z \t-]*)\})(\n.*)?$/, "", text); - sub(/^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*[ \t]*\{/, "", attrib); - sub(/\}(\n.*)?$/, "", attrib); - gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); + ret = ret headline(1, text, 0) ; block = substr( block, len + 1 ); + continue; - return headline( n, text, attrib ) _block( substr( block, len + 1) ); + # Second Order Heading H2 + Attrib + } else if ( match( block, /^([^\n]+)([ \t]*\{([^\}\n]+)\})\n---+(\n|$)/ ) ) { + len = RLENGTH; text = attrib = block; + sub(/([ \t]*\{([^\}\n]+)\})\n---+(\n.*)?$/, "", text); + sub(/\}\n---+(\n.*)?$/, "", attrib); sub(/^([^\n]+)[ \t]*\{/, "", attrib); + gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); - # Nth Order Heading H1 H2 H3 H4 H5 H6 - } else if ( match( block, /^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*(\n|$)/ ) ) { - len = RLENGTH; text = substr(block, 1, len); - match(block, /^##?#?#?#?#?[^#]/); n = RLENGTH - 1; - # sub(/^(##?#?#?#?#?)[ \t]+/, "", text); # not working in mawk - text = substr(text, n + 1); sub(/^[ \t]*/, "", text); - sub(/[ \t]*#*(\n.*)?$/, "", text); - - return headline( n, text, 0 ) _block( substr( block, len + 1) ); - - # block images (wrapped in
    ) - } else if ( match(block, "^!" lix "\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?(\n|$)") ) { - len = RLENGTH; text = href = title = attrib = substr( block, 1, len); - - sub("^!\\[", "", text); - sub("\\]\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?(\n.*)?$", "", text); - - sub("^!" lix "\\([\n\t ]*", "", href); - sub("([\n\t ]+" lit ")?[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?(\n.*)?$", "", href); - - sub("^!" lix "\\([\n\t ]*" lid, "", title); - sub("[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?(\n.*)?$", "", title); - sub("^[\n\t ]+", "", title); - - sub("^!" lix "\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)", "", attrib); - sub("(\n.*)?$", "", attrib); - sub(/^\{[ \t]*/, "", attrib); sub(/[ \t]*\}$/, "", attrib); gsub(/[ \t]+/, " ", attrib); - - if ( match(href, /^<.*>$/) ) { sub(/^$/, "", href); } - if ( match(title, /^".*"$/) ) { sub(/^"/, "", title); sub(/"$/, "", title); } - else if ( match(title, /^'.*'$/) ) { sub(/^'/, "", title); sub(/'$/, "", title); } - else if ( match(title, /^\(.*\)$/) ) { sub(/^\(/, "", title); sub(/\)$/, "", title); } - - gsub(/^[\t ]+$/, "", text); gsub(/\\/, "", href); - - return "
    " \ - "\""" \ - (title?"
    " inline(title) "
    ":"") \ - "
    \n\n" \ - _block( substr( block, len + 1) ); - - } else if ( match(block, /^!\[([^]]*)\] ?\[([^]]*)\](\n|$)/ ) ) { - len = RLENGTH; text = id = block; - sub(/(\n.*)?$/, "", text); sub( /^!\[/, "", text); sub(/\] ?\[([^\n]*)\]$/, "", text); - sub(/(\n.*)?$/, "", id); sub( /^!\[([^\n]*)\] ?\[/, "", id); sub(/\]$/, "", id); - # text = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\](\n.*)?$/, "\\1", 1, block); - # id = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\](\n.*)?$/, "\\2", 1, block); - if ( ! id ) id = text; - if ( rl_href[id] && rl_title[id] ) { - return "
    " \ - "\""" \ - "
    " inline(rl_title[id]) "
    " \ - "
    \n\n" \ - _block( substr( block, len + 1) ); - } else if ( rl_href[id] ) { - return "
    " \ - "\""" \ - "
    \n\n" \ - _block( substr( block, len + 1) ); + ret = ret headline(2, text, attrib) ; block = substr( block, len + 1); + continue; + + # Second Order Heading H2 + } else if ( match( block, /^([^\n]+)\n---+(\n|$)/ ) ) { + len = RLENGTH; text = substr(block, 1, len); + sub(/\n---+(\n.*)?$/, "", text); + + ret = ret headline(2, text, 0) ; block = substr( block, len + 1); + continue; + + # Nth Order Heading H1 H2 H3 H4 H5 H6 + Attrib + } else if ( match( block, /^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*([ \t]*\{([a-zA-Z \t-]*)\})(\n|$)/ ) ) { + len = RLENGTH; text = attrib = substr(block, 1, len); + match(block, /^##?#?#?#?#?[^#]/); n = RLENGTH - 1; + + # sub(/^(##?#?#?#?#?)[ \t]*/, "", text); # not working in mawk + text = substr(text, n + 1); sub(/^[ \t]*/, "", text); + sub(/[ \t]*#*([ \t]*\{([a-zA-Z \t-]*)\})(\n.*)?$/, "", text); + sub(/^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*[ \t]*\{/, "", attrib); + sub(/\}(\n.*)?$/, "", attrib); + gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); + + ret = ret headline( n, text, attrib ) ; block = substr( block, len + 1); + continue; + + # Nth Order Heading H1 H2 H3 H4 H5 H6 + } else if ( match( block, /^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*(\n|$)/ ) ) { + len = RLENGTH; text = substr(block, 1, len); + match(block, /^##?#?#?#?#?[^#]/); n = RLENGTH - 1; + # sub(/^(##?#?#?#?#?)[ \t]+/, "", text); # not working in mawk + text = substr(text, n + 1); sub(/^[ \t]*/, "", text); + sub(/[ \t]*#*(\n.*)?$/, "", text); + + ret = ret headline( n, text, 0 ) ; block = substr( block, len + 1); + continue; + + # block images (wrapped in
    ) + } else if ( match(block, "^!" lix "\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?(\n|$)") ) { + len = RLENGTH; text = href = title = attrib = substr( block, 1, len); + + sub("^!\\[", "", text); + sub("\\]\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?(\n.*)?$", "", text); + + sub("^!" lix "\\([\n\t ]*", "", href); + sub("([\n\t ]+" lit ")?[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?(\n.*)?$", "", href); + + sub("^!" lix "\\([\n\t ]*" lid, "", title); + sub("[\n\t ]*\\)(\\{[a-zA-Z \t-]*\\})?(\n.*)?$", "", title); + sub("^[\n\t ]+", "", title); + + sub("^!" lix "\\([\n\t ]*" lid "([\n\t ]+" lit ")?[\n\t ]*\\)", "", attrib); + sub("(\n.*)?$", "", attrib); + sub(/^\{[ \t]*/, "", attrib); sub(/[ \t]*\}$/, "", attrib); gsub(/[ \t]+/, " ", attrib); + + if ( match(href, /^<.*>$/) ) { sub(/^$/, "", href); } + if ( match(title, /^".*"$/) ) { sub(/^"/, "", title); sub(/"$/, "", title); } + else if ( match(title, /^'.*'$/) ) { sub(/^'/, "", title); sub(/'$/, "", title); } + else if ( match(title, /^\(.*\)$/) ) { sub(/^\(/, "", title); sub(/\)$/, "", title); } + + gsub(/^[\t ]+$/, "", text); gsub(/\\/, "", href); + + ret = ret "
    " \ + "\""" \ + (title?"
    " inline(title) "
    ":"") \ + "
    \n\n"; + block = substr( block, len + 1); + continue; + + } else if ( match(block, /^!\[([^]]*)\] ?\[([^]]*)\](\n|$)/ ) ) { + len = RLENGTH; text = id = block; + sub(/(\n.*)?$/, "", text); sub( /^!\[/, "", text); sub(/\] ?\[([^\n]*)\]$/, "", text); + sub(/(\n.*)?$/, "", id); sub( /^!\[([^\n]*)\] ?\[/, "", id); sub(/\]$/, "", id); + # text = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\](\n.*)?$/, "\\1", 1, block); + # id = gensub(/^!\[([^\n]*)\] ?\[([^\n]*)\](\n.*)?$/, "\\2", 1, block); + if ( ! id ) id = text; + if ( rl_href[id] && rl_title[id] ) { + ret = ret "
    " \ + "\""" \ + "
    " inline(rl_title[id]) "
    " \ + "
    \n\n"; + block = substr( block, len + 1); + continue; + + } else if ( rl_href[id] ) { + ret = ret "
    " \ + "\""" \ + "
    \n\n"; + block = substr( block, len + 1); + continue; + } else { + ret = ret "

    " HTML(substr(block, 1, len)) "

    \n" ; block = substr(block, len + 1); + continue; + } + + # Macros (standalone <> calls handled as block, so they are not wrapped in paragraph) + } else if ( match( block, /^<<(([^>]|>[^>])+)>>(\n|$)/ ) ) { + len = RLENGTH; text = block; + sub(/^<>(\n.*)?$/, "", text); + # text = gensub(/^<<(([^>]|>[^>])+)>>(\n.*)?$/, "\\1", 1, block); + ret = ret "" HTML(text) "" ; block = substr(block, len + 1); + continue; + + # Definition list + } else if (match( block, "^(([ \t]*\n)*[^:\n \t][^\n]+\n" \ + "([ \t]*\n)* ? ? ?:[ \t][^\n]+(\n|$)" \ + "(([ \t]*\n)* ? ? ?:[ \t][^\n]+(\n|$)" \ + "|[^:\n \t][^\n]+(\n|$)" \ + "|( ? ? ?\t| +)[^\n]+(\n|$)" \ + "|([ \t]*\n)+( ? ? ?\t| +)[^\n]+(\n|$))*)+" \ + )) { + list = substr( block, 1, RLENGTH); block = substr( block, RLENGTH + 1); + ret = ret "
    \n" _dlist( list ) "
    \n"; + continue; + + # Unordered list types + } else if ( text = _startlist( block, "ul", "-", "([+*•]|[0-9]+\\.|#\\.|[0-9]+\\)|#\\))") ) { + return ret text; + } else if ( text = _startlist( block, "ul", "\\+", "([-*•]|[0-9]+\\.|#\\.|[0-9]+\\)|#\\))") ) { + return ret text; + } else if ( text = _startlist( block, "ul", "\\*", "([-+•]|[0-9]+\\.|#\\.|[0-9]+\\)|#\\))") ) { + return ret text; + } else if ( text = _startlist( block, "ul", "•", "([-+*]|[0-9]+\\.|#\\.|[0-9]+\\)|#\\))") ) { + return ret text; + + # Ordered list types + } else if ( text = _startlist( block, "ol", "[0-9]+\\.", "([-+*•]|#\\.|[0-9]+\\)|#\\))") ) { + return ret text; + } else if ( text = _startlist( block, "ol", "[0-9]+\\)", "([-+*•]|[0-9]+\\.|#\\.|#\\))") ) { + return ret text; + } else if ( text = _startlist( block, "ol", "#\\.", "([-+*•]|[0-9]+\\.|[0-9]+\\)|#\\))") ) { + return ret text; + } else if ( text = _startlist( block, "ol", "#\\)", "([-+*•]|[0-9]+\\.|#\\.|[0-9]+\\))") ) { + return ret text; + + # Split paragraphs + } else if ( match( block, /(^|\n)[[:space:]]*(\n|$)/) ) { + len = RLENGTH; st = RSTART; + ret = ret _block( substr(block, 1, st - 1) ) "\n"; block = substr(block, st + len); + continue; + + # Horizontal rule + # } else if ( match( block, /(^|\n) ? ? ?((\* *){3,}|(- *){3,}|(_ *){3,})($|\n)/) ) { + } else if ( match( block, /(^|\n) ? ? ?((\* *)(\* *)(\* *)(\* *)*|(- *)(- *)(- *)(- *)*|(_ *)(_ *)(_ *)(_ *)*)($|\n)/) ) { + len = RLENGTH; st = RSTART; + ret = ret _block(substr(block, 1, st - 1)) "
    \n"; block = substr(block, st + len); + continue; + + # Plain paragraph } else { - return "

    " HTML(substr(block, 1, len)) "

    \n" _block( substr(block, len + 1) ); + return ret "

    " inline(block) "

    \n"; } - - # Macros (standalone <> calls handled as block, so they are not wrapped in paragraph) - } else if ( match( block, /^<<(([^>]|>[^>])+)>>(\n|$)/ ) ) { - len = RLENGTH; text = block; - sub(/^<>(\n.*)?$/, "", text); - # text = gensub(/^<<(([^>]|>[^>])+)>>(\n.*)?$/, "\\1", 1, block); - return "" HTML(text) "" _block(substr(block, len + 1) ); - - # Definition list - } else if (match( block, "^(([ \t]*\n)*[^:\n \t][^\n]+\n" \ - "([ \t]*\n)* ? ? ?:[ \t][^\n]+(\n|$)" \ - "(([ \t]*\n)* ? ? ?:[ \t][^\n]+(\n|$)" \ - "|[^:\n \t][^\n]+(\n|$)" \ - "|( ? ? ?\t| +)[^\n]+(\n|$)" \ - "|([ \t]*\n)+( ? ? ?\t| +)[^\n]+(\n|$))*)+" \ - )) { - list = substr( block, 1, RLENGTH); block = substr( block, RLENGTH + 1); - return "
    \n" _dlist( list ) "
    \n" _block( block ); - - # Unordered list types - } else if ( text = _startlist( block, "ul", "-", "([+*•]|[0-9]+\\.|#\\.|[0-9]+\\)|#\\))") ) { - return text; - } else if ( text = _startlist( block, "ul", "\\+", "([-*•]|[0-9]+\\.|#\\.|[0-9]+\\)|#\\))") ) { - return text; - } else if ( text = _startlist( block, "ul", "\\*", "([-+•]|[0-9]+\\.|#\\.|[0-9]+\\)|#\\))") ) { - return text; - } else if ( text = _startlist( block, "ul", "•", "([-+*]|[0-9]+\\.|#\\.|[0-9]+\\)|#\\))") ) { - return text; - - # Ordered list types - } else if ( text = _startlist( block, "ol", "[0-9]+\\.", "([-+*•]|#\\.|[0-9]+\\)|#\\))") ) { - return text; - } else if ( text = _startlist( block, "ol", "[0-9]+\\)", "([-+*•]|[0-9]+\\.|#\\.|#\\))") ) { - return text; - } else if ( text = _startlist( block, "ol", "#\\.", "([-+*•]|[0-9]+\\.|[0-9]+\\)|#\\))") ) { - return text; - } else if ( text = _startlist( block, "ol", "#\\)", "([-+*•]|[0-9]+\\.|#\\.|[0-9]+\\))") ) { - return text; - - # Split paragraphs - } else if ( match( block, /(^|\n)[[:space:]]*(\n|$)/) ) { - len = RLENGTH; st = RSTART; - return _block( substr(block, 1, st - 1) ) "\n" \ - _block( substr(block, st + len) ); - - # Horizontal rule - # } else if ( match( block, /(^|\n) ? ? ?((\* *){3,}|(- *){3,}|(_ *){3,})($|\n)/) ) { - } else if ( match( block, /(^|\n) ? ? ?((\* *)(\* *)(\* *)(\* *)*|(- *)(- *)(- *)(- *)*|(_ *)(_ *)(_ *)(_ *)*)($|\n)/) ) { - len = RLENGTH; st = RSTART; - return _block(substr(block, 1, st - 1)) "
    \n" _block(substr(block, st + len)); - - # Plain paragraph - } else { - return "

    " inline(block) "

    \n"; } + return ret; } function _startlist(block, type, mark, exclude, LOCAL, st, len, list, indent, it, text) { -- 2.39.5 From 98df52bf69b6f2d838264902de21369013602102 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Fri, 30 Aug 2024 19:18:04 +0200 Subject: [PATCH 08/16] modify regexes for use in mawk --- markdown.awk | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/markdown.awk b/markdown.awk index fc2f203..6e3febe 100755 --- a/markdown.awk +++ b/markdown.awk @@ -701,23 +701,25 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, ret = ret headline(2, text, 0) ; block = substr( block, len + 1); continue; - # Nth Order Heading H1 H2 H3 H4 H5 H6 + Attrib - } else if ( match( block, /^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*([ \t]*\{([a-zA-Z \t-]*)\})(\n|$)/ ) ) { + # # Nth Order Heading H1 H2 H3 H4 H5 H6 + Attrib + # } else if ( match( block, /^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*[ \t]*\{[a-zA-Z \t-]*\}(\n|$)/ ) ) { + } else if ( match( block, /^##?#?#?#?#?[^#\n]([^\n#]|#[^\t\n# ]|#[\t ]+[^\t\n ])+#*[\t ]*\{[\ta-zA-Z -]*\}(\n|$)/ ) ) { len = RLENGTH; text = attrib = substr(block, 1, len); match(block, /^##?#?#?#?#?[^#]/); n = RLENGTH - 1; - # sub(/^(##?#?#?#?#?)[ \t]*/, "", text); # not working in mawk text = substr(text, n + 1); sub(/^[ \t]*/, "", text); sub(/[ \t]*#*([ \t]*\{([a-zA-Z \t-]*)\})(\n.*)?$/, "", text); - sub(/^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*[ \t]*\{/, "", attrib); + + sub(/^##?#?#?#?#?[^#\n]([^\n#]|#[^\t\n# ]|#[\t ]+[^\t\n ])+#*[\t ]*\{/, "", attrib); sub(/\}(\n.*)?$/, "", attrib); gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); - ret = ret headline( n, text, attrib ) ; block = substr( block, len + 1); + ret = ret headline( n, text, attrib ); block = substr( block, len + 1); continue; # Nth Order Heading H1 H2 H3 H4 H5 H6 - } else if ( match( block, /^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*(\n|$)/ ) ) { + # } else if ( match( block, /^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*(\n|$)/ ) ) { + } else if ( match( block, /^##?#?#?#?#?[^#\n]([^\n#]|#[^\t\n# ]|#[\t ]+[^\t\n ])+#*(\n|$)/ ) ) { len = RLENGTH; text = substr(block, 1, len); match(block, /^##?#?#?#?#?[^#]/); n = RLENGTH - 1; # sub(/^(##?#?#?#?#?)[ \t]+/, "", text); # not working in mawk @@ -854,10 +856,12 @@ function _startlist(block, type, mark, exclude, LOCAL, st, len, list, indent, it "|[^\n \t][^\n]+(\n|$))*" ) ) { st = RSTART; len = RLENGTH; list = substr( block, st, len); - sub("^\n", "", list); match(list, "^ ? ? ?"); indent = RLENGTH; - it = ""; while ( indent > 0 ) { it = it " ?"; indent--; } + sub("^\n", "", list); match(list, "^( | | |)"); indent = RLENGTH; # gsub( "(^|\n) {0," indent "}", "\n", list); sub("^\n", "", list); - gsub( "(^|\n)" it, "\n", list); sub("^\n", "", list); + # emulate greedy range matcher for mawk + it = "("; while ( indent > 0 ) { for (k = indent; k > 0; k--) { it = it " "; } it = it "|"; indent--; } + sub(/\|$/, ")?", it); sub(/^\($/, "", it); + gsub( "(^|\n)" it, "\n", list ); sub("^\n", "", list); text = substr(block, 1, st - 1); block = substr(block, st + len); if (match(text, /\n[[:space:]]*\n/)) return 0; @@ -875,7 +879,6 @@ function _list (block, mark, p, LOCAL, len, st, text, indent, it, task) { if ( match(block, "^([ \t]*\n)*$")) return; match(block, "^" mark "[ \t]"); indent = RLENGTH; - it = ""; while ( indent > 0 ) { it = it " ?"; indent--; } sub("^" mark "[ \t]", "", block); @@ -886,6 +889,9 @@ function _list (block, mark, p, LOCAL, len, st, text, indent, it, task) { text = substr(block, 1, st); block = substr(block, st + 1); # gsub("\n {0," indent "}", "\n", text); + # emulate greedy range matcher for mawk + it = "("; while ( indent > 0 ) { for (k = indent; k > 0; k--) { it = it " "; } it = it "|"; indent--; } + sub(/\|$/, ")?", it); sub(/^\($/, "", it); gsub("\n" it, "\n", text); task = match( text, /^\[ \]/ ) ? "
  • " : \ @@ -915,9 +921,11 @@ function _dlist (block, LOCAL, len, st, text, indent, it, p) { len = RLENGTH; text = substr(block, 1, len); sub( "^([ \t]*\n)*", "", text); match(text, "^ ? ? ?:(\t| +)"); indent = RLENGTH; - it = ""; while ( indent > 0 ) { it = it " ?"; indent--; } sub( "^ ? ? ?:(\t| +)", "", text); # gsub( "(^|\n) {0," indent "}", "\n", text ); + # emulate greedy range matcher for mawk + it = "("; while ( indent > 0 ) { for (k = indent; k > 0; k--) { it = it " "; } it = it "|"; indent--; } + sub(/\|$/, ")?", it); sub(/^\($/, "", it); gsub( "(^|\n)" it, "\n", text ); text = _nblock(text); -- 2.39.5 From 42028f563c652ea0cbdd069940b09e58771ea4e5 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Fri, 30 Aug 2024 21:42:54 +0200 Subject: [PATCH 09/16] improved tests for nested emphasis --- tests-markdown.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests-markdown.sh b/tests-markdown.sh index ff56c02..2bbcf5d 100755 --- a/tests-markdown.sh +++ b/tests-markdown.sh @@ -1,6 +1,6 @@ #!/bin/sh -runtimes="gawk bawk goawk" +runtimes="gawk bawk mawk goawk" BR=' ' @@ -54,6 +54,13 @@ assert '***strem***' '

    strem

    ' "strong em" assert '***str**em*' '

    strem

    ' "em strong" assert '_**strem**_' '

    strem

    ' "em strong" +assert '*foo**str**bar**str**qua*' '

    foostrbarstrqua

    ' 'em strong asterisk' +assert '**foo*em*bar*em*qua**' '

    fooembaremqua

    ' 'strong em asterisk' + +assert '_foo__str__bar__str__qua_' '

    foo__str__bar__str__qua

    ' 'em embedded underscore' +assert '__foo_em_bar_em_qua__' '

    foo_em_bar_em_qua

    ' 'strong embedded underscore' +assert '_**str**foo**str**_' '

    strfoostr

    ' 'em strong mixed' + assert '_foo_-> bar' '

    foo→ bar

    ' 'arrow' assert '`_foo_-> bar`' '

    _foo_-> bar

    ' 'arrow' assert ' <- comment' '

    <!-- comment --> ← comment

    ' 'arrow' -- 2.39.5 From 898d470f90e4055d0bcfe616bc009cca8d7f5692 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Fri, 30 Aug 2024 21:59:59 +0200 Subject: [PATCH 10/16] optimized emphasis regex for performance in mawk --- markdown.awk | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/markdown.awk b/markdown.awk index 6e3febe..6e1440c 100755 --- a/markdown.awk +++ b/markdown.awk @@ -307,6 +307,7 @@ function inline( line, LOCAL, len, text, code, href, guard, ret ) { ret = ret HTML(substr( line, 1, RLENGTH)); line = substr(line, RLENGTH + 1); continue; + # strong / em matchers use pre match pattern to make processing cheaper # __strong__$ } else if ( match(line, "^__(([^_[:space:]]|" ieu ")|([^_[:space:]]|" ieu ")(" nu "|" ieu ")*([^_[:space:]]|" ieu "))__$") ) { len = RLENGTH; @@ -320,7 +321,7 @@ function inline( line, LOCAL, len, text, code, href, guard, ret ) { continue; # **strong** - } else if ( match(line, "^\\*\\*(([^\\*[:space:]]|" iea ")|([^\\*[:space:]]|" iea ")(" na "|" iea ")*([^\\*[:space:]]|" iea "))\\*\\*") ) { + } else if ( match(line, "^\\*\\*(([^*[:space:]]|" iea ")|([^*[:space:]]|" iea ")(" na "|" iea ")*([^*[:space:]]|" iea "))\\*\\*") ) { len = RLENGTH; ret = ret "" inline( substr( line, 3, len - 4 ) ) ""; line = substr( line, len + 1 ); continue; @@ -338,7 +339,7 @@ function inline( line, LOCAL, len, text, code, href, guard, ret ) { continue; # *em* - } else if ( match(line, "^\\*(([^\\*[:space:]]|" isa ")|([^\\*[:space:]]|" isa ")(" na "|" isa ")*([^\\*[:space:]]|" isa "))\\*") ) { + } else if ( match(line, "^\\*(([^*[:space:]]|" isa ")|([^*[:space:]]|" isa ")(" na "|" isa ")*([^*[:space:]]|" isa "))\\*") ) { len = RLENGTH; ret = ret "" inline( substr( line, 2, len - 2 ) ) ""; line = substr( line, len + 1 ); continue; @@ -944,12 +945,12 @@ BEGIN { # hls = "0 0 0 0 0 0"; # Universal Patterns - nu = "(\\\\\\\\|\\\\[^\\\\]|[^\\\\_]|_[[:alnum:]])*" # not underline (except when escaped) - na = "(\\\\\\\\|\\\\[^\\\\]|[^\\\\\\*])*" # not asterisk (except when escaped) - ieu = "_([^_[:space:]]|[^_[:space:]]" nu "[^_[:space:]])_" # inner (underline) - isu = "__([^_[:space:]]|[^_[:space:]]" nu "[^_[:space:]])__" # inner (underline) - iea = "\\*([^\\*[:space:]]|[^\\*[:space:]]" na "[^\\*[:space:]])\\*" # inner (asterisk) - isa = "\\*\\*([^\\*[:space:]]|[^\\*[:space:]]" na "[^\\*[:space:]])\\*\\*" # inner (asterisk) + nu = "([^_\\\\]|\\\\.|_[[:alnum:]])" # not underline (except when escaped, or inside a word) + na = "([^*\\\\]|\\\\.)" # not asterisk (except when escaped) + ieu = "_([^_[:space:]]|[^_[:space:]]" nu "*[^_[:space:]])_" # inner (underline) + isu = "__([^_[:space:]]|[^_[:space:]]" nu "*[^_[:space:]])__" # inner (underline) + iea = "\\*([^*[:space:]]|[^*[:space:]]" na "*[^*[:space:]])\\*" # inner (asterisk) + isa = "\\*\\*([^*[:space:]]|[^*[:space:]]" na "*[^*[:space:]])\\*\\*" # inner (asterisk) lix="\\[(\\\\[^\n]|[^]\n\\\\[])*\\]" # link text lid="(<(\\\\[^\n]|[^\n<>\\\\])*>|(\\\\.|[^()\"'\\\\])+|([^<\n\t ()\\\\]|\\\\[^\n])(\\\\[\n]|[^\n\t \\(\\)\\\\])*)" # link dest -- 2.39.5 From 9b019ee3d845d1edac5a6a33ebf385b120f136f8 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Sat, 31 Aug 2024 19:24:38 +0200 Subject: [PATCH 11/16] simpler reex for matching email links (again, for portability ) --- markdown.awk | 2 +- tests-markdown.sh | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/markdown.awk b/markdown.awk index 6e1440c..ad8c438 100755 --- a/markdown.awk +++ b/markdown.awk @@ -175,7 +175,7 @@ function inline( line, LOCAL, len, text, code, href, guard, ret ) { # quick link email # } else if ( match( line, /^<[a-zA-Z0-9.!#$%&'\''*+\/=?^_`{|}~-]+@[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>/ ) ) { - } else if ( match( line, /^<[a-zA-Z0-9.!#$%&'\''*+\/=?^_`{|}~-]+@[a-zA-Z0-9]([a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9])?(\.[a-zA-Z0-9]([a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9-]?[a-zA-Z0-9])?)*>/ ) ) { + } else if ( match( line, /^<[a-zA-Z0-9.!#$%&'\''*+\/=?^_`{|}~-]+@([a-zA-Z0-9]\.[a-zA-Z0-9]|[a-zA-Z0-9-])+>/ ) ) { len = RLENGTH; href = HTML( substr( line, 2, len - 2) ); ret = ret "" href ""; line = substr( line, len + 1); diff --git a/tests-markdown.sh b/tests-markdown.sh index 2bbcf5d..a834391 100755 --- a/tests-markdown.sh +++ b/tests-markdown.sh @@ -80,6 +80,9 @@ assert '' "

    htt assert '' "

    http://de.wikipedia.org

    " "automatic link" # assert '' "

    http://de.wikipedia.org

    " "automatic link" +assert '' "

    hello&goodbye@sub-test.example.com

    " "automatic link, email" +# assert '' "

    hällö&guttbei@sub-test.example.com

    " "automatic link, email" + # Inline Links assert '[Wikipedia](http://de.wikipedia.org)' "

    Wikipedia

    " "inline link" assert '[Wikipedia](http://de.wikipedia.org "Online Encyclopedia")' "

    Wikipedia

    " "inline link" -- 2.39.5 From 32b4555b66c086805df962e68070956c4c1780d7 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Sat, 31 Aug 2024 21:53:16 +0200 Subject: [PATCH 12/16] split/modify some regexes for compatibility with old mawk implementations --- markdown.awk | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/markdown.awk b/markdown.awk index ad8c438..356a987 100755 --- a/markdown.awk +++ b/markdown.awk @@ -347,7 +347,11 @@ function inline( line, LOCAL, len, text, code, href, guard, ret ) { # Literal HTML entities # } else if ( match( line, /^&([a-zA-Z]{2,32}|#[0-9]{1,7}|#[xX][0-9a-fA-F]{1,6});/) ) { # mawk does not support repitition ranges - } else if ( match( line, /^&([a-zA-Z][a-zA-Z][a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?|#[0-9][0-9]?[0-9]?[0-9]?[0-9]?[0-9]?[0-9]?|#[xX][0-9a-fA-F][0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?);/) ) { + } else if ( match( line, /^&[a-zA-Z][a-zA-Z][a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?[a-zA-Z]?;/) ) { + len = RLENGTH; + ret = ret substr( line, 1, len ); line = substr(line, len + 1); + continue; + } else if ( match( line, /^&(#[0-9][0-9]?[0-9]?[0-9]?[0-9]?[0-9]?[0-9]?|#[xX][0-9a-fA-F][0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?[0-9a-fA-F]?);/) ) { len = RLENGTH; ret = ret substr( line, 1, len ); line = substr(line, len + 1); continue; @@ -425,8 +429,14 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, ret = ret _block(substr(block, 1, st - 1)) substr(block, st, len); block = substr(block, st + len); continue; - # HTML #6 - } else if ( AllowHTML && match( tolower(block), /(^|\n) ? ? ?<\/?(address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset|h[123456]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)([[:space:]\n>]|\/>)([^\n]|\n[ \t]*[^\n])*(\n[[:space:]]*\n|$)/) ) { + # HTML #6 (part1) + } else if ( AllowHTML && match( tolower(block), /(^|\n) ? ? ?<\/?(address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset)([[:space:]\n>]|\/>)([^\n]|\n[ \t]*[^\n])*(\n[[:space:]]*\n|$)/) ) { + len = RLENGTH; st = RSTART; + ret = ret _block(substr(block, 1, st - 1)) substr(block, st, len); block = substr(block, st + len); + continue; + + # HTML #6 (part2) + } else if ( AllowHTML && match( tolower(block), /(^|\n) ? ? ?<\/?(h[123456]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)([[:space:]\n>]|\/>)([^\n]|\n[ \t]*[^\n])*(\n[[:space:]]*\n|$)/) ) { len = RLENGTH; st = RSTART; ret = ret _block(substr(block, 1, st - 1)) substr(block, st, len); block = substr(block, st + len); continue; @@ -516,8 +526,7 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, "((\\|([^\n]+\\|)+\n)+" \ "\\+(-+\\+)+(\n|$))+" \ ) || \ - match( block, "^()()()" \ - "(\\+(:?-+:?\\+)+)\n" \ + match( block, "^(\\+(:?-+:?\\+)+)\n" \ "((\\|([^\n]+\\|)+\n)+" \ "\\+(-+\\+)+(\n|$))+" \ ) ) { @@ -857,7 +866,7 @@ function _startlist(block, type, mark, exclude, LOCAL, st, len, list, indent, it "|[^\n \t][^\n]+(\n|$))*" ) ) { st = RSTART; len = RLENGTH; list = substr( block, st, len); - sub("^\n", "", list); match(list, "^( | | |)"); indent = RLENGTH; + sub("^\n", "", list); match(list, "^( | | )?"); indent = RLENGTH; # gsub( "(^|\n) {0," indent "}", "\n", list); sub("^\n", "", list); # emulate greedy range matcher for mawk it = "("; while ( indent > 0 ) { for (k = indent; k > 0; k--) { it = it " "; } it = it "|"; indent--; } -- 2.39.5 From 290cc67245726ca72bb8450162cb2adc5a8c78d1 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Wed, 9 Oct 2024 07:10:39 +0200 Subject: [PATCH 13/16] bugfix: avoid endless loops for some incomplete matches --- markdown.awk | 15 +++++++-------- tests-markdown.sh | 13 +++++++++++-- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/markdown.awk b/markdown.awk index 356a987..bef97d1 100755 --- a/markdown.awk +++ b/markdown.awk @@ -372,11 +372,10 @@ function inline( line, LOCAL, len, text, code, href, guard, ret ) { ret = ret HTML(substr(line, 1, 1)); line = substr(line, 2); continue; - # continue walk over string - } else { - ret = ret substr(line, 1, 1); line = substr(line, 2); - continue; - } + } # inline patterns end + + # continue walk over string + ret = ret substr(line, 1, 1); line = substr(line, 2); } return ret; } @@ -851,10 +850,10 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, ret = ret _block(substr(block, 1, st - 1)) "
    \n"; block = substr(block, st + len); continue; + } # block patterns end + # Plain paragraph - } else { - return ret "

    " inline(block) "

    \n"; - } + return ret "

    " inline(block) "

    \n"; } return ret; } diff --git a/tests-markdown.sh b/tests-markdown.sh index a834391..6df4224 100755 --- a/tests-markdown.sh +++ b/tests-markdown.sh @@ -1,6 +1,6 @@ #!/bin/sh -runtimes="gawk bawk mawk goawk" +runtimes="gawk busybox mawk goawk" BR=' ' @@ -9,7 +9,7 @@ fail() { printf '%s\n' "$@"; exit 1; } awk() { /bin/awk "$@"; } md_gawk() { gawk -f markdown.awk "$@"; } -md_bawk() { busybox awk -f markdown.awk "$@"; } +md_busybox() { busybox awk -f markdown.awk "$@"; } md_mawk() { mawk -f markdown.awk "$@"; } md_goawk() { goawk -f markdown.awk "$@"; } @@ -45,6 +45,7 @@ assert '~sub~' '

    sub

    ' "subscript" assert "foo ${BR}bar" "

    foo
    ${BR}bar

    " 'double space line break' assert '```©```' "

    &copy;

    " "code span escape" +assert '````' "
    ````
    " "empty code span" assert '_emphasized text_' '

    emphasized text

    ' "em" assert '_emphasized_text_' '

    emphasized_text

    ' "em" @@ -190,6 +191,14 @@ not be but &shy; <escaped>' \ "fenced code block" +assert 'foobar +```` +foobar' \ +'

    foobar +```` +foobar

    ' \ +"Open Fence" + # Block Images assert '![Testbild](Test Bild.jpg)' \ '
    Testbild
    ' \ -- 2.39.5 From ac4031bc57b6ae56ecaba5af2b0f9cfd5a13ceb2 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Sun, 6 Apr 2025 12:03:07 +0200 Subject: [PATCH 14/16] allow `check` function for field indexes --- db23.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/db23.sh b/db23.sh index e8a0d64..8ee6f4f 100755 --- a/db23.sh +++ b/db23.sh @@ -31,8 +31,12 @@ DB2() { open|load) file="$1" cat "$file" || return 1 ;; - check|contains) key="$(STRING "$1")" val='' + check|contains) key="$(STRING "$1")" seq="${2:-1}" val="${data##*"${BR}${key}" }" val="${val%%"${BR}"*}" + [ "$val" = '' ] && return 1 || val="${val} " + while [ $seq -gt 1 ]; do + seq=$((seq - 1)) val="${val#* }" + done [ "$val" = '' ] && return 1 ;; count) key="$(STRING "$1")" val='' seq=0 -- 2.39.5 From b8dae070a1652d8a46a8eb81eaf4fc82a1a3b8de Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Mon, 5 May 2025 11:39:14 +0200 Subject: [PATCH 15/16] updated copyright info --- markdown.awk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/markdown.awk b/markdown.awk index bef97d1..90dddbf 100755 --- a/markdown.awk +++ b/markdown.awk @@ -5,7 +5,7 @@ # Meant to support all features of John Grubers basic Markdown # + a number of common extensions, mostly inspired by Pandoc Markdown -# Copyright 2021 - 2023 Paul Hänsch +# Copyright 2021 - 2024 Paul Hänsch # # Permission to use, copy, modify, and/or distribute this software for any # purpose with or without fee is hereby granted, provided that the above -- 2.39.5 From 0c39114d102cb1d523b2b7b63f7d9caba4cf441c Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Mon, 26 May 2025 21:30:20 +0200 Subject: [PATCH 16/16] bugfix: prevent content duplication when deleting nonexist key --- db23.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/db23.sh b/db23.sh index 8ee6f4f..b7ab548 100755 --- a/db23.sh +++ b/db23.sh @@ -65,8 +65,12 @@ DB2() { delete|remove) key="$(STRING "$1")" val="${data#*"${BR}${key}" *"${BR}"}" key="${data%"${BR}${key}" *"${BR}"*}" - [ "${key}${BR}${val}" = "${data}" ] && return 1 - printf '%s' "${key#"${BR}"}${BR}${val%"${BR}"}" + if [ "${val}" = "${data}" ]; then + printf %s\\n "${data}" + return 1 + else + printf '%s' "${key#"${BR}"}${BR}${val%"${BR}"}" + fi ;; set|store) key="$(STRING "$1")" val="" shift 1 -- 2.39.5