From 24ecc97dd3f20e7b2651e2b32d37732710684093 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Paul=20H=C3=A4nsch?= Date: Mon, 9 Oct 2023 01:36:51 +0200 Subject: [PATCH] Squashed 'cgilite/' changes from 880ed14..41642aa 41642aa link and image syntax allowing whitespace URLs, repace use of non-posix gensub() 31cfd89 change order of precedence (HTML binds more than link brackets) git-subtree-dir: cgilite git-subtree-split: 41642aa44923fa56c05b62210cc990474cb5bd71 --- markdown.awk | 173 ++++++++++++++++++++++++++++----------------------- 1 file changed, 95 insertions(+), 78 deletions(-) diff --git a/markdown.awk b/markdown.awk index 6143b73..75f1827 100755 --- a/markdown.awk +++ b/markdown.awk @@ -122,13 +122,6 @@ function URL ( text, sharp ) { } function inline( line, LOCAL, len, text, code, href, guard ) { - nu = "(\\\\\\\\|\\\\[^\\\\]|[^\\\\_]|_[[:alnum:]])*" # not underline (except when escaped) - na = "(\\\\\\\\|\\\\[^\\\\]|[^\\\\\\*])*" # not asterisk (except when escaped) - ieu = "_([^_[:space:]]|[^_[:space:]]" nu "[^_[:space:]])_" # inner (underline) - isu = "__([^_[:space:]]|[^_[:space:]]" nu "[^_[:space:]])__" # inner (underline) - iea = "\\*([^\\*[:space:]]|[^\\*[:space:]]" na "[^\\*[:space:]])\\*" # inner (asterisk) - isa = "\\*\\*([^\\*[:space:]]|[^\\*[:space:]]" na "[^\\*[:space:]])\\*\\*" # inner (asterisk) - if ( line ~ /^$/ ) { # Recursion End return ""; @@ -154,6 +147,11 @@ function inline( line, LOCAL, len, text, code, href, guard ) { return "" code "" inline( substr( line, len + 1 ) ) } + # Macros + } else if ( match( line, /^<<([^>]|>[^>])+>>/ ) ) { + len = RLENGTH; + return "" HTML( substr( line, 3, len - 4 ) ) "" inline(substr(line, len + 1)); + # Wiki style links } else if ( match( line, /^\[\[([^]|]+)(\|[^]]+)?\]\]/) ) { len = RLENGTH; @@ -174,21 +172,28 @@ function inline( line, LOCAL, len, text, code, href, guard ) { href = URL( substr( line, 2, len - 2) ); return "" href "" inline( substr( line, len + 1) ); + # Verbatim inline HTML + } else if ( 
AllowHTML && match( line, /^(|<\?([^\?]|\?[^>])*\?>|]*>|])*\]\]>|<\/[A-Za-z][A-Za-z0-9-]*[[:space:]]*>|<[A-Za-z][A-Za-z0-9-]*([[:space:]]+[A-Za-z_:][A-Za-z0-9_\.:-]*([[:space:]]*=[[:space:]]*([[:space:]"'=<>`]+|"[^"]*"|'[^']*'))?)*[[:space:]]*\/?>)/) ) { + len = RLENGTH; + return substr( line, 1, len) inline(substr(line, len + 1)); + # inline links - # ,_______________________Image____________________________, - } else if ( match(line, /^\[([^]]+|!\[[^]]*\]\([^"\)]+([ \t]+"[^"]+")?\)(\{[a-zA-Z \t-]*\})?)\]\(([^"\)]+)([[:space:]]+"([^"]+)")?\)/) ) { + } else if ( match(line, "^" lii "\\([\\n\\t ]*" lid "([\\n\\t ]+" lit ")?[\\n\\t ]*\\)") ) { len = RLENGTH; - text = gensub(/^\[([^]]+|!\[[^]]*\]\([^"\)]+([ \t]+"[^"]+")?\)(\{[a-zA-Z \t-]*\})?)\]\(([^"\)]+)([[:space:]]+"([^"]+)")?\)/, \ - "\\1", 1, substr(line, 1, len) ); - href = gensub(/^\[([^]]+|!\[[^]]*\]\([^"\)]+([ \t]+"[^"]+")?\)(\{[a-zA-Z \t-]*\})?)\]\(([^"\)]+)([[:space:]]+"([^"]+)")?\)/, \ - "\\4", 1, substr(line, 1, len) ); - title = gensub(/^\[([^]]+|!\[[^]]*\]\([^"\)]+([ \t]+"[^"]+")?\)(\{[a-zA-Z \t-]*\})?)\]\(([^"\)]+)([[:space:]]+"([^"]+)")?\)/, \ - "\\6", 1, substr(line, 1, len) ); - if ( title ) { - return "" inline( text ) "" inline( substr( line, len + 1) ); - } else { - return "" inline( text ) "" inline( substr( line, len + 1) ); - } + text = href = title = substr( line, 1, len); + sub("^\\[", "", text); sub("\\]\\([\\n\\t ]*" lid "([\\n\\t ]+" lit ")?[\\n\\t ]*\\)$", "", text); + sub("^" lii "\\([\\n\\t ]*", "", href); sub("([\\n\\t ]+" lit ")?[\\n\\t ]*\\)$", "", href); + sub("^" lii "\\([\\n\\t ]*" lid, "", title); sub("[\\n\\t ]*\\)$", "", title); sub("^[\\n\\t ]+", "", title); + + if ( match(href, /^<.*>$/) ) { sub(/^$/, "", href); } + if ( match(title, /^".*"$/) ) { sub(/^"/, "", title); sub(/"$/, "", title); } + else if ( match(title, /^'.*'$/) ) { sub(/^'/, "", title); sub(/'$/, "", title); } + else if ( match(title, /^\(.*\)$/) ) { sub(/^\(/, "", title); sub(/\)$/, "", title); } + + 
gsub(/\\/, "", href); gsub(/\\/, "", title); gsub(/[\n\t]+/, " ", title); + + return "" \ + inline( text ) "" inline( substr( line, len + 1) ); # reference style links } else if ( match(line, /^\[([^]]+)\] ?\[([^]]*)\]/ ) ) { @@ -205,25 +210,32 @@ function inline( line, LOCAL, len, text, code, href, guard ) { } # inline images - } else if ( match(line, /^!\[([^]]*)\]\(([^"\)]+)([ \t]+"([^"]+)")?\)(\{([a-zA-Z \t-]*)\})?/) ) { - len = RLENGTH; - text = gensub(/^!\[([^]]*)\]\(([^"\)]+)([ \t]+"([^"]+)")?\)(\{([a-zA-Z \t-]*)\})?/, "\\1", "g", substr(line, 1, len) ); - href = gensub(/^!\[([^]]*)\]\(([^"\)]+)([ \t]+"([^"]+)")?\)(\{([a-zA-Z \t-]*)\})?/, "\\2", "g", substr(line, 1, len) ); - title = gensub(/^!\[([^]]*)\]\(([^"\)]+)([ \t]+"([^"]+)")?\)(\{([a-zA-Z \t-]*)\})?/, "\\4", "g", substr(line, 1, len) ); - attrib = gensub(/^!\[([^]]*)\]\(([^"\)]+)([ \t]+"([^"]+)")?\)(\{([a-zA-Z \t-]*)\})?/, "\\6", "g", substr(line, 1, len) ); - if ( title && attrib ) { - return "\""" \ - inline( substr( line, len + 1) ); - } else if ( title ) { - return "\""" \ - inline( substr( line, len + 1) ); - } else if ( attrib ) { - return "\""" \ - inline( substr( line, len + 1) ); - } else { - return "\""" \ - inline( substr( line, len + 1) ); - } + } else if ( match(line, "^!" lix "\\([\\n\\t ]*" lid "([\\n\\t ]+" lit ")?[\\n\\t ]*\\)(\\{[a-zA-Z \\t-]*\\})?") ) { + len = RLENGTH; text = href = title = attrib = substr( line, 1, len); + + sub("^!\\[", "", text); + sub("\\]\\([\\n\\t ]*" lid "([\\n\\t ]+" lit ")?[\\n\\t ]*\\)(\\{[a-zA-Z \\t-]*\\})?$", "", text); + + sub("^!" lix "\\([\\n\\t ]*", "", href); + sub("([\\n\\t ]+" lit ")?[\\n\\t ]*\\)(\\{[a-zA-Z \\t-]*\\})?$", "", href); + + sub("^!" lix "\\([\\n\\t ]*" lid, "", title); + sub("[\\n\\t ]*\\)(\\{[a-zA-Z \\t-]*\\})?$", "", title); + sub("^[\\n\\t ]+", "", title); + + sub("^!" 
lix "\\([\\n\\t ]*" lid "([\\n\\t ]+" lit ")?[\\n\\t ]*\\)", "", attrib); + sub(/^\{[ \t]*/, "", attrib); sub(/[ \t]*\}$/, "", attrib); gsub(/[ \t]+/, " ", attrib); + + if ( match(href, /^<.*>$/) ) { sub(/^$/, "", href); } + if ( match(title, /^".*"$/) ) { sub(/^"/, "", title); sub(/"$/, "", title); } + else if ( match(title, /^'.*'$/) ) { sub(/^'/, "", title); sub(/'$/, "", title); } + else if ( match(title, /^\(.*\)$/) ) { sub(/^\(/, "", title); sub(/\)$/, "", title); } + + gsub(/\\/, "", href); gsub(/\\/, "", title); gsub(/[\n\t]+/, " ", title); + + return "\""" inline( substr( line, len + 1) ); # reference style images } else if ( match(line, /^!\[([^]]*)\] ?\[([^]]*)\]/ ) ) { @@ -290,16 +302,6 @@ function inline( line, LOCAL, len, text, code, href, guard ) { len = RLENGTH; return "" inline( substr( line, 2, len - 2 ) ) "" inline( substr( line, len + 1 ) ); - # Macros - } else if ( match( line, /^<<([^>]|>[^>])+>>/ ) ) { - len = RLENGTH; - return "" HTML( substr( line, 3, len - 4 ) ) "" inline(substr(line, len + 1)); - - # Verbatim inline HTML - } else if ( AllowHTML && match( line, /^(|<\?([^\?]|\?[^>])*\?>|]*>|])*\]\]>|<\/[A-Za-z][A-Za-z0-9-]*[[:space:]]*>|<[A-Za-z][A-Za-z0-9-]*([[:space:]]+[A-Za-z_:][A-Za-z0-9_\.:-]*([[:space:]]*=[[:space:]]*([[:space:]"'=<>`]+|"[^"]*"|'[^']*'))?)*[[:space:]]*\/?>)/) ) { - len = RLENGTH; - return substr( line, 1, len) inline(substr(line, len + 1)); - # Literal HTML entities } else if ( match( line, /^&([a-zA-Z]{2,32}|#[0-9]{1,7}|#[xX][0-9a-fA-F]{1,6});/) ) { len = RLENGTH; @@ -636,35 +638,36 @@ function _block( block, LOCAL, st, len, text, title, attrib, href, guard, code, return headline( n, text, 0 ) _block( substr( block, len + 1) ); # block images (wrapped in
) - } else if ( match(block, /^!\[([^]]*)\]\(([^"\)]+)([ \t]+"([^"]+)")?\)(\{([a-zA-Z \t-]*)\})?(\n|$)/) ) { - len = RLENGTH; - text = gensub(/^!\[([^]]*)\]\(([^"\)]+)([ \t]+"([^"]+)")?\)(\{([a-zA-Z \t-]*)\})?(\n.*)?$/, "\\1", "g", block); - href = gensub(/^!\[([^]]*)\]\(([^"\)]+)([ \t]+"([^"]+)")?\)(\{([a-zA-Z \t-]*)\})?(\n.*)?$/, "\\2", "g", block); - title = gensub(/^!\[([^]]*)\]\(([^"\)]+)([ \t]+"([^"]+)")?\)(\{([a-zA-Z \t-]*)\})?(\n.*)?$/, "\\4", "g", block); - attrib = gensub(/^!\[([^]]*)\]\(([^"\)]+)([ \t]+"([^"]+)")?\)(\{([a-zA-Z \t-]*)\})?(\n.*)?$/, "\\6", "g", block); - if ( title && attrib ) { - return "
" \ - "\""" \ - "
" inline(title) "
" \ - "
\n\n" \ - _block( substr( block, len + 1) ); - } else if ( title ) { - return "
" \ - "\""" \ - "
" inline(title) "
" \ - "
\n\n" \ - _block( substr( block, len + 1) ); - } else if ( attrib ) { - return "
" \ - "\""" \ - "
\n\n" \ - _block( substr( block, len + 1) ); - } else { - return "
" \ - "\""" \ - "
\n\n" \ - _block( substr( block, len + 1) ); - } + } else if ( match(block, "^!" lix "\\([\\n\\t ]*" lid "([\\n\\t ]+" lit ")?[\\n\\t ]*\\)(\\{[a-zA-Z \\t-]*\\})?(\\n|$)") ) { + len = RLENGTH; text = href = title = attrib = substr( block, 1, len); + + sub("^!\\[", "", text); + sub("\\]\\([\\n\\t ]*" lid "([\\n\\t ]+" lit ")?[\\n\\t ]*\\)(\\{[a-zA-Z \\t-]*\\})?(\\n.*)?$", "", text); + + sub("^!" lix "\\([\\n\\t ]*", "", href); + sub("([\\n\\t ]+" lit ")?[\\n\\t ]*\\)(\\{[a-zA-Z \\t-]*\\})?(\\n.*)?$", "", href); + + sub("^!" lix "\\([\\n\\t ]*" lid, "", title); + sub("[\\n\\t ]*\\)(\\{[a-zA-Z \\t-]*\\})?(\\n.*)?$", "", title); + sub("^[\\n\\t ]+", "", title); + + sub("^!" lix "\\([\\n\\t ]*" lid "([\\n\\t ]+" lit ")?[\\n\\t ]*\\)", "", attrib); + sub("(\\n.*)?$", "", attrib); + sub(/^\{[ \t]*/, "", attrib); sub(/[ \t]*\}$/, "", attrib); gsub(/[ \t]+/, " ", attrib); + + if ( match(href, /^<.*>$/) ) { sub(/^$/, "", href); } + if ( match(title, /^".*"$/) ) { sub(/^"/, "", title); sub(/"$/, "", title); } + else if ( match(title, /^'.*'$/) ) { sub(/^'/, "", title); sub(/'$/, "", title); } + else if ( match(title, /^\(.*\)$/) ) { sub(/^\(/, "", title); sub(/\)$/, "", title); } + + gsub(/\\/, "", href); + + return "
" \ + "\""" \ + (title?"
" inline(title) "
":"") \ + "
\n\n" \ + _block( substr( block, len + 1) ); # reference style images (block) } else if ( match(line, /^!\[([^]]*)\] ?\[([^]]*)\](\n|$)/ ) ) { @@ -822,6 +825,20 @@ BEGIN { HL[1] = 0; HL[2] = 0; HL[3] = 0; HL[4] = 0; HL[5] = 0; HL[6] = 0; # hls = "0 0 0 0 0 0"; + # Universal Patterns + nu = "(\\\\\\\\|\\\\[^\\\\]|[^\\\\_]|_[[:alnum:]])*" # not underline (except when escaped) + na = "(\\\\\\\\|\\\\[^\\\\]|[^\\\\\\*])*" # not asterisk (except when escaped) + ieu = "_([^_[:space:]]|[^_[:space:]]" nu "[^_[:space:]])_" # inner (underline) + isu = "__([^_[:space:]]|[^_[:space:]]" nu "[^_[:space:]])__" # inner (underline) + iea = "\\*([^\\*[:space:]]|[^\\*[:space:]]" na "[^\\*[:space:]])\\*" # inner (asterisk) + isa = "\\*\\*([^\\*[:space:]]|[^\\*[:space:]]" na "[^\\*[:space:]])\\*\\*" # inner (asterisk) + + lix="\\[(\\\\[^\\n]|[^]\\n\\\\[])*\\]" # link text + lid="(<(\\\\[^\\n]|[^\\n<>\\\\])*>|([^<\\n\\t ()\\\\]|\\\\[^\\n])(\\\\[\\n]|[^\\n\\t ()\\\\])*)" # link dest + lit="(\"(\\\\.|[^\"\\\\])*\"|'(\\\\.|[^'\\\\])*'|\\((\\\\.|[^()\\\\])*\\))" # link title + # link text with image def + lii="\\[(\\\\[^\\n]|[^]\\n\\\\[])*(!" lix "\\([\\n\\t ]*" lid "([\\n\\t ]+" lit ")?[\\n\\t ]*\\))?(\\\\[^\\n]|[^]\\n\\\\[])*\\]" + # Buffering of full file ist necessary, e.g. to find reference links while (getline) { file = file $0 "\n"; } # Clean up MS-DOS line breaks -- 2.39.2