#!/bin/awk -f #!/opt/busybox/awk -f # EXPERIMENTAL Markdown processor with minimal dependencies. # Meant to support all features of John Gruber's basic Markdown # + a number of common extensions, mostly inspired by Pandoc Markdown # Copyright 2021 - 2024 Paul Hänsch # # Permission to use, copy, modify, and/or distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR # IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. # Supported Features / TODO: # ========================== # [x] done [ ] todo [-] not planned ? unsure # # Basic Markdown - Block elements: # ------------------------------- # - [x] Paragraphs # - [x] Double space line breaks # - [x] Proper block element nesting # - [x] Headings # - [x] ATX-Style Headings # - [x] Blockquotes # - [x] Lists (ordered, unordered) # - [x] Code blocks (using indentation) # - [x] Horizontal rules # - [x] Verbatim HTML block (disabled by default) # # Basic Markdown - Inline elements: # --------------------------------- # - [x] Links # - [x] Reference style links # - [x] Emphasis *em*/**strong** (*Asterisk*, _Underscore_) # - [x] `code`, also ``code containing `backticks` `` # - [x] Images / reference style images # - [x] <automatic links> # - [x] backslash escapes # - [x] Verbatim HTML inline (disabled by default) # - [x] HTML escaping # # NOTE: Set the environment variable MD_HTML=true to enable verbatim HTML # # Extensions - Block elements: # ---------------------------- # - [x] Automatic <section>-wrapping (custom) # - ? Heading identifiers (php md, pandoc) # - [x] Heading attributes (custom) # - [ ]

terminates section # - [x] Automatic heading identifiers (custom) # - [x] Fenced code blocks (php md, pandoc) # - [x] Fenced code attributes # - [x] Images (as block elements,

":""); } HL[hlvl]++; for ( n = hlvl + 1; n <= 6; n++) { HL[n] = 0;} hid = ""; for ( n = 2; n <= blvl; n++) { hid = hid BL[n] "/"; } hid = hid HL[1]; for ( n = 2; n <= hlvl; n++) { hid = hid "." HL[n] ; } hid = hid ":" URL(htxt, 1); # sub(/([0-9]+( [0-9]+){5})$/, "", hstack); sub(/([0-9]+( [0-9]+)( [0-9]+)( [0-9]+)( [0-9]+)( [0-9]+))$/, "", hstack); hstack = hstack HL[1] " " HL[2] " " HL[3] " " HL[4] " " HL[5] " " HL[6]; return sec "

" \ "" inline( htxt ) \ "" \ "\n"; }

# Nested Block, resets heading counters
#
# Renders `block` through _block() as a nested block level with its own set
# of heading counters, then closes whatever sections are still open inside it.
#
# Parameters:
#   block  - text of the nested block
#   LOCAL  - unused placeholder; the names after it (sec, n) serve as locals
#
# Globals read and written: hstack, blvl, BL, HL
#
# Returns: the rendered block followed by one closing section tag for every
#          non-zero heading counter of the nested level.
#
# NOTE(review): in the copy of this file that was reviewed, the markup inside
# the closing-tag string literal had been lost and the literal was split
# across lines (not valid awk). "</section>" is reconstructed from the
# section-wrapping feature named in the file header -- confirm against the
# canonical source.
function _nblock( block, LOCAL, sec, n ) {
  # Push six zeroed heading counters for the nested level onto the stack.
  hstack = hstack " 0 0 0 0 0 0";

  # Block Level: enter one level deeper, count this block on its level and
  # drop the counters of any deeper levels left over from earlier blocks.
  blvl++; BL[blvl]++;
  for ( n = blvl + 1; n in BL; n++) { delete BL[n]; }

  block = _block( block );

  # Load the trailing counters on the stack (this level's) into HL.
  match(hstack, /([0-9]+( [0-9]+)( [0-9]+)?( [0-9]+)?( [0-9]+)?( [0-9]+)?)$/);
  split( substr(hstack, RSTART), HL);

  # One closing tag per heading level whose counter is non-zero.
  sec = "";
  for ( n = 1; n <= 6; n++ ) { sec = sec (HL[n]?"</section>":""); }

  # Pop this level's counters off the stack again.
  sub("( +[0-9]+)( +[0-9]+)?( +[0-9]+)?( +[0-9]+)?( +[0-9]+)?( +[0-9]+)? *$", "", hstack);

  blvl--;
  return block sec;
}

function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, indent, list, tmp, ret) { ret = ""; while ( block != "" ) { gsub( "(^\n+|\n+$)", "", block ); # HTML #2 #3 #4 $5 if ( AllowHTML && match( block, /(^|\n) ? ? ?(|$)|<\?([^\?]|\?[^>])*(\?>|$)|]*(>|$)|])*(\]\]>|$))/) ) { len = RLENGTH; st = RSTART; ret = ret _block(substr(block, 1, st - 1)) substr(block, st, len); block = substr(block, st + len); continue; # HTML #6 (part1) } else if ( AllowHTML && match( tolower(block), /(^|\n) ? ? ?<\/?(address|article|aside|base|basefont|blockquote|body|caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption|figure|footer|form|frame|frameset)([[:space:]\n>]|\/>)([^\n]|\n[ \t]*[^\n])*(\n[[:space:]]*\n|$)/) ) { len = RLENGTH; st = RSTART; ret = ret _block(substr(block, 1, st - 1)) substr(block, st, len); block = substr(block, st + len); continue; # HTML #6 (part2) } else if ( AllowHTML && match( tolower(block), /(^|\n) ? ? ?<\/?(h[123456]|head|header|hr|html|iframe|legend|li|link|main|menu|menuitem|nav|noframes|ol|optgroup|option|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)([[:space:]\n>]|\/>)([^\n]|\n[ \t]*[^\n])*(\n[[:space:]]*\n|$)/) ) { len = RLENGTH; st = RSTART; ret = ret _block(substr(block, 1, st - 1)) substr(block, st, len); block = substr(block, st + len); continue; # HTML #1 } else if ( AllowHTML && match( tolower(block), /(^|\n) ? ? ?<(script|pre|style)([[:space:]\n>]).*(<\/script>|<\/pre>|<\/style>|$)/) ) { len = RLENGTH; st = RSTART; match( tolower(substr(block, st, len)), /(<\/script>|<\/pre>|<\/style>)/); len = RSTART + RLENGTH; ret = ret _block(substr(block, 1, st - 1)) substr(block, st, len); block = substr(block, st + len); continue; # HTML #7 } else if ( AllowHTML && match( block, /^ ? ? 
?(<\/[A-Za-z][A-Za-z0-9-]*[[:space:]]*>|<[A-Za-z][A-Za-z0-9-]*([[:space:]]+[A-Za-z_:][A-Za-z0-9_\.:-]*([[:space:]]*=[[:space:]]*([[:space:]"'=<>`]+|"[^"]*"|'[^']*'))?)*[[:space:]]*\/?>)([[:space:]]*\n)([^\n]|\n[ \t]*[^\n])*(\n[[:space:]]*\n|$)/) ) { len = RLENGTH; st = RSTART; ret = ret substr(block, st, len); block = substr(block, st + len); continue; # Metadata (custom, block starting with %something) # Metadata is ignored but can be interpreted externally } else if ( match(block, /^%[a-zA-Z-]+([[:space:]][^\n]*)?(\n|$)(%[a-zA-Z-]+([[:space:]][^\n]*)?(\n|$)|%([[:space:]][^\n]*)?(\n|$)|[ \t]+[^\n[:space:]][^\n]*(\n|$))*/) ) { len = RLENGTH; st = RSTART; block = substr( block, len + 1); continue; # Blockquote (leading >) } else if ( match( block, /^> /) ) { match( block, /(^|\n)[[:space:]]*(\n|$)/ ) || match(block, /$/); len = RLENGTH; st = RSTART; text = substr(block, 1, st - 1); gsub( /(^|\n)> /, "\n", text ); text = _nblock( text ); gsub( /^\n|\n$/, "", text ) ret = ret "

" text "

\n\n"; block = substr(block, st + len); continue; # Pipe Tables (pandoc / php md / gfm ) } else if ( match(block, "^((\\|)?([^\n]+\\|)+[^\n]+(\\|)?)\n" \ "((\\|)?(:?-+:?[\\|+])+:?-+:?(\\|)?)\n" \ "((\\|)?([^\n]+\\|)+[^\n]+(\\|)?(\n|$))+" ) ) { len = RLENGTH; st = RSTART; #initialize empty arrays split("", talign); split("", tarray); cols = 0; cnt=0; ttext = ""; # table header and alignment tmp = substr(block, 1, match(block, /(\n|$)/)); gsub( /(^|[^\\])\\\|/, "\\1\\|", tmp ); gsub( /(^\||\|$)/, "", tmp) split( tmp, tarray, /\|/); block = substr(block, match(block, /(\n|$)/) + 1 ); tmp = substr(block, 1, match(block, /(\n|$)/)); gsub( /(^\||\|$)/, "", tmp ); cols = split( tmp , talign, /[+\|]/); block = substr(block, match(block, /(\n|$)/) + 1 ); for( cnt = 1; cnt < cols; cnt++ ) { if (match(talign[cnt], /:-+:/)) talign[cnt]="center"; else if (match(talign[cnt], /-+:/)) talign[cnt]="right"; else if (match(talign[cnt], /:-+/)) talign[cnt]="left"; else talign[cnt]=""; } ttext = "\n" for (cnt = 1; cnt < cols; cnt++) ttext = ttext "" inline(tarray[cnt]) "" ttext = ttext "\n\n" while ( match(block, "^((\\|)?([^\n]+\\|)+[^\n]+(\\|)?(\n|$))+" ) ){ tmp = substr(block, 1, match(block, /(\n|$)/)); gsub( /(^|[^\\])\\\|/, "\\1\\|", tmp ); gsub( /(^\||\|$)/, "", tmp ); split( tmp, tarray, /\|/); block = substr(block, match(block, /(\n|$)/) + 1 ); ttext = ttext "" for (cnt = 1; cnt < cols; cnt++) ttext = ttext "" inline(tarray[cnt]) "" ttext = ttext "\n" } ret = ret "" ttext "

\n"; continue; # Grid Tables (pandoc) # (with, and without header) } else if ( match( block, "^\\+(-+\\+)+\n" \ "(\\|([^\n]+\\|)+\n)+" \ "(\\+(:?=+:?\\+)+)\n" \ "((\\|([^\n]+\\|)+\n)+" \ "\\+(-+\\+)+(\n|$))+" \ ) || \ match( block, "^(\\+(:?-+:?\\+)+)\n" \ "((\\|([^\n]+\\|)+\n)+" \ "\\+(-+\\+)+(\n|$))+" \ ) ) { len = RLENGTH; st = RSTART; #initialize empty arrays split("", talign); split("", tarray); split("", tread); cols = 0; cnt=0; ttext = ""; # Column Count tmp = block; sub( "(\n.*)*$", "", tmp); cols = split( tmp, tread, /\+/) - 2; # debug(" Cols: " gensub( "^(\\+(:?-+:?\\+)+)(\n.*)*$", "\\1", 1, block )); # table alignment match(block, "((:?=+:?\\+|(:-+|-+:|:-+:)\\+)+)"); split( substr(block, RSTART, RLENGTH) , talign, /\+/ ); # split( gensub( "^(.*\n)?\\+((:?=+:?\\+|(:-+|-+:|:-+:)\\+)+)(\n.*)$", "\\2", "g", block ), talign, /\+/ ); # debug("Align: " gensub( "^(.*\n)?\\+((:?=+:?\\+|(:-+|-+:|:-+:)\\+)+)(\n.*)$", "\\2", "g", block )); for (cnt = 1; cnt <= cols; cnt++) { if (match(talign[cnt], /:(-+|=+):/)) talign[cnt]="center"; else if (match(talign[cnt], /(-+|=+):/)) talign[cnt]="right"; else if (match(talign[cnt], /:(-+|=+)/ )) talign[cnt]="left"; else talign[cnt]=""; } if ( match(block, "^\\+(-+\\+)+\n" \ "(\\|([^\n]+\\|)+\n)+" \ "\\+(:?=+:?\\+)+\n" \ "((\\|([^\n]+\\|)+\n)+" \ "\\+(-+\\+)+(\n|$))+" \ ) ) { # table header block = substr(block, match(block, /(\n|$)/) + 1 ); while ( match(block, "^\\|([^\n]+\\|)+\n") ) { tmp = substr(block, 1, match(block, /(\n|$)/)); gsub( /\\\\/, "\\\", tmp); gsub(/\\\|/, "\\|", tmp); gsub( /(^\||\|$)/, "", tmp ); split(tmp, tread, /\|/); block = substr(block, match(block, /(\n|$)/) + 1 ); for (cnt = 1; cnt <= cols; cnt++) tarray[cnt] = tarray[cnt] "\n" tread[cnt]; } ttext = "\n" for (cnt = 1; cnt <= cols; cnt++) ttext = ttext "" _nblock(tarray[cnt]) "" ttext = ttext "\n" } # table body block = substr(block, match(block, /(\n|$)/) + 1 ); ttext = ttext "\n" while ( match(block, /^((\|([^\n]+\|)+\n)+\+(-+\+)+(\n|$))+/ ) ){ 
split("", tarray); while ( match(block, /^\|([^\n]+\|)+\n/) ) { tmp = substr(block, 1, match(block, /(\n|$)/)); gsub( /\\\\/, "\\\", tmp); gsub(/\\\|/, "\\|", tmp); gsub( /(^\||\|$)/, "", tmp); split( tmp, tread, /\|/); block = substr(block, match(block, /(\n|$)/) + 1 ); for (cnt = 1; cnt <= cols; cnt++) tarray[cnt] = tarray[cnt] "\n" tread[cnt]; } block = substr(block, match(block, /(\n|$)/) + 1 ); ttext = ttext "" for (cnt = 1; cnt <= cols; cnt++) ttext = ttext "" _nblock(tarray[cnt]) "" ttext = ttext "\n" } return ret "" ttext "

\n" _nblock(block); # Line Blocks (pandoc) } else if ( match(block, /^\| [^\n]*(\n|$)(\| [^\n]*(\n|$)|[ \t]+[^\n[:space:]][^\n]*(\n|$))*/) ) { len = RLENGTH; st = RSTART; text = substr(block, 1, len); gsub(/\n[[:space:]]+/, " ", text); gsub(/\n\| /, "\n", text); gsub(/^\| |\n$/, "", text); text = inline(text); gsub(/\n/, "
\n", text); ret = ret "

" text "

\n"; block = substr( block, len + 1); continue; # Indented Code Block } else if ( match(block, /^(( |\t)[^\n]*[^\n\t ][^\n]*(\n|$))(( |\t)[^\n]*(\n|$)|[\t ]*(\n|$))*/) ) { len = RLENGTH; st = RSTART; code = substr(block, 1, len); gsub(/(^|\n)( |\t)/, "\n", code); gsub(/^\n|\n+$/, "", code); ret = ret "

" HTML( code ) "

\n"; block = substr( block, len + 1 ); continue; # Fenced Divs (pandoc, custom) } else if ( match( block, /^(:::+)/ ) ) { guard = substr( block, 1, RLENGTH ); attrib = code = block; sub(/^[^\n]+\n/, "", code); sub(/^:::+[ \t]*\{?[ \t]*/, "", attrib); sub(/\}?[ \t]*\n.*$/, "", attrib); # attrib = gensub(/^:::+[ \t]*\{?[ \t]*([^\}\n]*)\}?[ \t]*\n.*$/, "\\1", 1, attrib); gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); if ( match(code, "(^|\n)" guard "+(\n|$)" ) && attrib ) { len = RLENGTH; st = RSTART; ret = ret "

" _nblock( substr(code, 1, st - 1) ) "

\n"; block = substr( code, st + len ); continue; } else if ( match(code, "(^|\n)" guard "+(\n|$)" ) ) { len = RLENGTH; st = RSTART; ret = ret "

" _nblock( substr(code, 1, st - 1) ) "

\n"; block = substr( code, st + len ); continue; } else { match( block, /(^|\n)[[:space:]]*(\n|$)/ ) || match( block, /$/ ); len = RLENGTH; st = RSTART; ret = ret "

" inline( substr(block, 1, st - 1) ) "

\n"; block = substr(block, st + len); continue; } # Fenced Code Block (pandoc) } else if ( match( block, /^(~~~+|```+)/ ) ) { guard = substr( block, 1, RLENGTH ); attrib = code = block; sub(/^[^\n]+\n/, "", code); sub(/^(~~~+|```+)[ \t]*\{?[ \t]*/, "", attrib); sub(/\}?[ \t]*\n.*$/, "", attrib); # attrib = gensub(/^(~~~+|```+)[ \t]*\{?[ \t]*([^\}\n]*)\}?[ \t]*\n.*$/, "\\2", 1, attrib); gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); if ( match(code, "(^|\n)" guard "+(\n|$)" ) && attrib ) { len = RLENGTH; st = RSTART; ret = ret "

" \
                        HTML( substr(code, 1, st - 1) ) "

\n"; block = substr( code, st + len ); continue; } else if ( match(code, "(^|\n)" guard "+(\n|$)" ) ) { len = RLENGTH; st = RSTART; ret = ret "

" HTML( substr(code, 1, st - 1) ) "

\n"; block = substr( code, st + len ); continue; } else { match( block, /(^|\n)[[:space:]]*(\n|$)/ ) || match( block, /$/ ); len = RLENGTH; st = RSTART; ret = ret "

" inline( substr(block, 1, st - 1) ) "

\n"; block = substr(block, st + len); continue; } # First Order Heading H1 + Attrib } else if ( match( block, /^([^\n]+)([ \t]*\{([^\}\n]+)\})\n===+(\n|$)/ ) ) { len = RLENGTH; text = attrib = block; sub(/([ \t]*\{([^\}\n]+)\})\n===+(\n.*)?$/, "", text); sub(/\}\n===+(\n.*)?$/, "", attrib); sub(/^([^\n]+)[ \t]*\{/, "", attrib); gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); ret = ret headline(1, text, attrib) ; block = substr( block, len + 1 ); continue; # First Order Heading H1 } else if ( match( block, /^([^\n]+)\n===+(\n|$)/ ) ) { len = RLENGTH; text = substr(block, 1, len); sub(/\n===+(\n.*)?$/, "", text); ret = ret headline(1, text, 0) ; block = substr( block, len + 1 ); continue; # Second Order Heading H2 + Attrib } else if ( match( block, /^([^\n]+)([ \t]*\{([^\}\n]+)\})\n---+(\n|$)/ ) ) { len = RLENGTH; text = attrib = block; sub(/([ \t]*\{([^\}\n]+)\})\n---+(\n.*)?$/, "", text); sub(/\}\n---+(\n.*)?$/, "", attrib); sub(/^([^\n]+)[ \t]*\{/, "", attrib); gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); ret = ret headline(2, text, attrib) ; block = substr( block, len + 1); continue; # Second Order Heading H2 } else if ( match( block, /^([^\n]+)\n---+(\n|$)/ ) ) { len = RLENGTH; text = substr(block, 1, len); sub(/\n---+(\n.*)?$/, "", text); ret = ret headline(2, text, 0) ; block = substr( block, len + 1); continue; # # Nth Order Heading H1 H2 H3 H4 H5 H6 + Attrib # } else if ( match( block, /^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*[ \t]*\{[a-zA-Z \t-]*\}(\n|$)/ ) ) { } else if ( match( block, /^##?#?#?#?#?[^#\n]([^\n#]|#[^\t\n# ]|#[\t ]+[^\t\n ])+#*[\t ]*\{[\ta-zA-Z -]*\}(\n|$)/ ) ) { len = RLENGTH; text = attrib = substr(block, 1, len); match(block, /^##?#?#?#?#?[^#]/); n = RLENGTH - 1; # sub(/^(##?#?#?#?#?)[ \t]*/, "", text); # not working in mawk text = substr(text, n + 1); sub(/^[ \t]*/, "", text); sub(/[ \t]*#*([ \t]*\{([a-zA-Z \t-]*)\})(\n.*)?$/, "", text); 
sub(/^##?#?#?#?#?[^#\n]([^\n#]|#[^\t\n# ]|#[\t ]+[^\t\n ])+#*[\t ]*\{/, "", attrib); sub(/\}(\n.*)?$/, "", attrib); gsub(/[^a-zA-Z0-9_-]+/, " ", attrib); gsub(/(^ | $)/, "", attrib); ret = ret headline( n, text, attrib ); block = substr( block, len + 1); continue; # Nth Order Heading H1 H2 H3 H4 H5 H6 # } else if ( match( block, /^(##?#?#?#?#?)[ \t]*(([^ \t\n]+|[ \t]+[^ \t\n#]|[ \t]+#+[ \t]*[^ \t\n#])+)[ \t]*#*(\n|$)/ ) ) { } else if ( match( block, /^##?#?#?#?#?[^#\n]([^\n#]|#[^\t\n# ]|#[\t ]+[^\t\n ])+#*(\n|$)/ ) ) { len = RLENGTH; text = substr(block, 1, len); match(block, /^##?#?#?#?#?[^#]/); n = RLENGTH - 1; # sub(/^(##?#?#?#?#?)[ \t]+/, "", text); # not working in mawk text = substr(text, n + 1); sub(/^[ \t]*/, "", text); sub(/[ \t]*#*(\n.*)?$/, "", text); ret = ret headline( n, text, 0 ) ; block = substr( block, len + 1); continue; # block images (wrapped in