Markdown-1.0.2b7.pl 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642
  1. #!/usr/bin/env perl
  2. #
  3. # Markdown -- A text-to-HTML conversion tool for web writers
  4. #
  5. # Copyright (c) 2004-2005 John Gruber
  6. # <http://daringfireball.net/projects/markdown/>
  7. #
  8. package Markdown;
  9. require 5.006_000;
  10. use strict;
  11. use warnings;
  12. use Digest::MD5 qw(md5_hex);
  13. use vars qw($VERSION);
  14. $VERSION = '1.0.2b7';
  15. # Tue 29 Aug 2006
  16. ## Disabled; causes problems under Perl 5.6.1:
  17. # use utf8;
  18. # binmode( STDOUT, ":utf8" ); # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html
  #
  # Global default settings:
  #
  my $g_empty_element_suffix = " />";    # Change to ">" for HTML output
  my $g_tab_width            = 4;

  #
  # Globals:
  #

  # Pattern for a run of text containing balanced [brackets]. See Friedl's
  # "Mastering Regular Expressions", 2nd Ed., pp. 328-331. The variable is
  # declared on its own line first because the pattern refers back to
  # itself through the deferred (??{ ... }) construct.
  my $g_nested_brackets;
  $g_nested_brackets = qr{
      (?>                                 # Atomic matching
          [^\[\]]+                        # Anything other than brackets
        |
          \[
              (??{ $g_nested_brackets })  # Recursive set of nested brackets
          \]
      )*
  }x;

  # Table of hash values for escaped characters: each character in the
  # literal below is mapped to the MD5 hex digest of that character.
  my %g_escape_table =
      map { ( $_ => md5_hex($_) ) } split //, '\\`*_{}[]()>#+-.!';

  # Global hashes, used by various utility routines
  my %g_urls;
  my %g_titles;
  my %g_html_blocks;

  # Used to track when we're inside an ordered or unordered list
  # (see _ProcessListItems() for details):
  my $g_list_level = 0;
  #### Blosxom plug-in interface ##########################################

  # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
  # which posts Markdown should process, using a "meta-markup: markdown"
  # header. If it's set to 0 (the default), Markdown will process all
  # entries.
  my $g_blosxom_use_meta = 0;

  # Part of the Blosxom plug-in interface; takes no action and always
  # returns a true value.
  sub start {
      return 1;
  }
  # Part of the Blosxom plug-in interface. Receives six positional
  # arguments, of which only the sixth (a reference to the entry body) is
  # used. The body is replaced in place with its Markdown() rendering,
  # either unconditionally (when $g_blosxom_use_meta is false) or only when
  # $meta::markup is set to "markdown". Always returns 1.
  sub story {
      my $body_ref = $_[5];

      my $should_convert = !$g_blosxom_use_meta
          || ( defined($meta::markup)
               && $meta::markup =~ /^\s*markdown\s*$/i );

      if ($should_convert) {
          ${$body_ref} = Markdown( ${$body_ref} );
      }

      return 1;
  }
  #### Movable Type plug-in interface #####################################
  eval { require MT };    # Test to see if we're running in MT.
  unless ($@) {
      require MT;
      MT->import;
      require MT::Template::Context;
      MT::Template::Context->import;

      eval { require MT::Plugin };    # Test to see if we're running >= MT 3.0.
      unless ($@) {
          require MT::Plugin;
          MT::Plugin->import;
          my $plugin = MT::Plugin->new({
              name        => "Markdown",
              description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
              doc_link    => 'http://daringfireball.net/projects/markdown/'
          });
          MT->add_plugin($plugin);
      }

      # <MarkdownOptions output="..."> container tag: remembers the requested
      # output mode (lower-cased) in the context stash, then builds and
      # returns the enclosed template content.
      MT::Template::Context->add_container_tag(MarkdownOptions => sub {
          my $ctx     = shift;
          my $args    = shift;
          my $builder = $ctx->stash('builder');
          my $tokens  = $ctx->stash('tokens');

          if ( defined( $args->{'output'} ) ) {
              $ctx->stash( 'markdown_output', lc $args->{'output'} );
          }

          defined( my $str = $builder->build( $ctx, $tokens ) )
              or return $ctx->error( $builder->errstr );
          return $str;
      });

      # Plain Markdown text filter. An output mode starting with "html"
      # selects HTML-style empty tags, "raw" bypasses Markdown entirely, and
      # anything else selects XHTML-style empty tags.
      MT->add_text_filter('markdown' => {
          label     => 'Markdown',
          docs      => 'http://daringfireball.net/projects/markdown/',
          on_format => sub {
              my $text = shift;
              my $ctx  = shift;
              my $raw  = 0;
              if ( defined $ctx ) {
                  my $output = $ctx->stash('markdown_output');
                  if ( defined $output && $output =~ m/^html/i ) {
                      $g_empty_element_suffix = ">";
                      $ctx->stash( 'markdown_output', '' );
                  }
                  elsif ( defined $output && $output eq 'raw' ) {
                      $raw = 1;
                      $ctx->stash( 'markdown_output', '' );
                  }
                  else {
                      $raw                    = 0;
                      $g_empty_element_suffix = " />";
                  }
              }
              $text = $raw ? $text : Markdown($text);
              return $text;
          },
      });

      # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
      my $smartypants;
      {
          no warnings "once";
          $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
      }

      if ($smartypants) {
          MT->add_text_filter('markdown_with_smartypants' => {
              label     => 'Markdown With SmartyPants',
              docs      => 'http://daringfireball.net/projects/markdown/',
              on_format => sub {
                  my $text = shift;
                  my $ctx  = shift;
                  if ( defined $ctx ) {
                      my $output = $ctx->stash('markdown_output');

                      # Use the same test as the plain 'markdown' filter
                      # above, so that any mode beginning with "html" (for
                      # example "html4") selects HTML-style tags here too.
                      if ( defined $output && $output =~ m/^html/i ) {
                          $g_empty_element_suffix = ">";
                      }
                      else {
                          $g_empty_element_suffix = " />";
                      }
                  }
                  $text = Markdown($text);
                  $text = $smartypants->( $text, '1' );
                  return $text;
              },
          });
      }
  }
  else {
      #### BBEdit/command-line text filter interface ##########################
      # Needs to be hidden from MT (and Blosxom when running in static mode).

      # We're only using $blosxom::version once; tell Perl not to warn us:
      no warnings 'once';
      unless ( defined($blosxom::version) ) {
          use warnings;

          #### Check for command-line switches: #################
          my %cli_opts;
          use Getopt::Long;
          Getopt::Long::Configure('pass_through');
          GetOptions(
              \%cli_opts,
              'version',
              'shortversion',
              'html4tags',
          );
          if ( $cli_opts{'version'} ) {    # Version info
              print "\nThis is Markdown, version $VERSION.\n";
              print "Copyright 2004 John Gruber\n";
              print "http://daringfireball.net/projects/markdown/\n\n";
              exit 0;
          }
          if ( $cli_opts{'shortversion'} ) {    # Just the version number string.
              print $VERSION;
              exit 0;
          }
          if ( $cli_opts{'html4tags'} ) {    # Use HTML tag style instead of XHTML
              $g_empty_element_suffix = ">";
          }

          #### Process incoming text: ###########################
          my $text = do {
              local $/;    # Slurp the whole file
              <>;
          };
          print Markdown($text);
      }
  }
  sub Markdown {
  #
  # Main function: takes Markdown source text and returns the converted
  # text followed by a newline. The order of the steps below is essential.
  # Link and image substitutions need to happen before
  # _EscapeSpecialCharsWithinTagAttributes(), so that any *'s or _'s in the
  # <a> and <img> tags get encoded.
  #
      my ($text) = @_;

      # Empty the global lookup tables first. Otherwise definitions from one
      # article leak into the next when a page contains several articles
      # (e.g. an index page that shows the N most recent articles).
      %{$_} = () for ( \%g_urls, \%g_titles, \%g_html_blocks );

      # Standardize line endings: DOS (\r\n) and Mac (\r) both become \n.
      $text =~ s{\r\n?}{\n}g;

      # Make sure $text ends with a couple of newlines:
      $text .= "\n\n";

      # Convert all tabs to spaces.
      $text = _Detab($text);

      # Blank out lines consisting only of spaces and tabs, so that later
      # patterns can match consecutive blank lines with /\n+/ instead of
      # something contorted like /[ \t]*\n+/ .
      $text =~ s/^[ \t]+$//mg;

      # Remaining passes, in order:
      #   1. turn block-level HTML blocks into hash entries
      #   2. strip link definitions, storing them in the global hashes
      #   3. run the block-level transformations
      #   4. restore the characters that were escaped along the way
      my @passes = (
          \&_HashHTMLBlocks,
          \&_StripLinkDefinitions,
          \&_RunBlockGamut,
          \&_UnescapeSpecialChars,
      );
      for my $pass (@passes) {
          $text = $pass->($text);
      }

      return $text . "\n";
  }
  sub _StripLinkDefinitions {
  #
  # Strips link definitions from the text and records them in the global
  # hashes %g_urls and %g_titles, keyed by the lower-cased link id.
  #
  # Takes the document text; returns it with every definition removed.
  #
      my $text          = shift;
      my $less_than_tab = $g_tab_width - 1;

      # Link defs are in the form: ^[id]: url "optional title"
      while ($text =~ s{
              ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
                [ \t]*
                \n?               # maybe *one* newline
                [ \t]*
              <?(\S+?)>?          # url = $2
                [ \t]*
                \n?               # maybe one newline
                [ \t]*
              (?:
                  (?<=\s)         # lookbehind for whitespace
                  ["(]
                  (.+?)           # title = $3
                  [")]
                  [ \t]*
              )?                  # title is optional
              (?:\n+|\Z)
          }
          {}mx) {
          # Copy the captures straight away; link IDs are case-insensitive.
          my ( $link_id, $url, $title ) = ( lc $1, $2, $3 );

          $g_urls{$link_id} = _EncodeAmpsAndAngles($url);

          # Test for definedness rather than truth, so that a title that
          # happens to be the string "0" is still recorded.
          if ( defined $title ) {
              $title =~ s/"/&quot;/g;
              $g_titles{$link_id} = $title;
          }
      }

      return $text;
  }
  sub _HashHTMLBlocks {
  #
  # Replaces block-level HTML in the text with placeholder keys. Each block
  # is stored in %g_html_blocks under the MD5 hex digest of its text, and
  # the digest, surrounded by blank lines, takes its place in the text.
  #
  # Takes the document text; returns the text with blocks replaced.
  #
      my $text          = shift;
      my $less_than_tab = $g_tab_width - 1;

      # Records one block in %g_html_blocks and returns its placeholder.
      my $stash_block = sub {
          my ($html) = @_;
          my $digest = md5_hex($html);
          $g_html_blocks{$digest} = $html;
          return "\n\n" . $digest . "\n\n";
      };

      # Hashify HTML blocks:
      # We only want to do this for block-level HTML tags, such as headers,
      # lists, and tables. That's because we still want to wrap <p>s around
      # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
      # phrase emphasis, and spans. The list of tags we're looking for is
      # hard-coded:
      my $block_tags = qr{
          (?:
              p | div | h[1-6] | blockquote | pre | table |
              dl | ol | ul | script | noscript | form |
              fieldset | iframe | math | ins | del
          )
      }x;

      my $tag_attrs = qr{
          (?:                 # Match one attr name/value pair
              \s+             # There needs to be at least some whitespace
                              # before each attribute name.
              [\w.:_-]+       # Attribute name
              \s*=\s*
              (["'])          # Attribute quoter
              .+?             # Attribute value
              \1              # Closing quoter
          )*                  # Zero or more
      }x;

      my $empty_tag = qr{< \w+ $tag_attrs \s* />}xms;
      my $open_tag  = qr{< $block_tags $tag_attrs \s* >}xms;
      my $close_tag = undef;    # let Text::Balanced handle this

      use Text::Balanced qw(gen_extract_tagged);
      my $extract_block = gen_extract_tagged(
          $open_tag, $close_tag, undef, { ignore => [$empty_tag] }
      );

      # Consume the text one line at a time, collecting the output pieces.
      my @chunks;

      ## TO-DO: the 0,3 on the next line ought to respect the
      ## tabwidth, or else, we should mandate 4-space tabwidth and
      ## be done with it:
      while ($text =~ s{^(([ ]{0,3}<)?.*\n)}{}m) {
          my ( $cur_line, $tag_start ) = ( $1, $2 );

          # Only a line starting (after at most three spaces) with "<" can
          # open an HTML block; only then is an extraction attempted.
          my ( $tag, $remainder );
          if ( defined $tag_start ) {
              ( $tag, $remainder ) = $extract_block->( $cur_line . $text );
          }

          if ($tag) {
              push @chunks, $stash_block->($tag);
              $text = $remainder;
          }
          else {
              # Nothing extracted, so the line passes through unchanged.
              push @chunks, $cur_line;
          }
      }
      push @chunks, $text;    # Whatever is left.
      $text = join '', @chunks;

      # Special case just for <hr />. It was easier to make a special case than
      # to make the other regex more complicated.
      $text =~ s{
                  (?:
                      (?<=\n\n)       # Starting after a blank line
                      |               # or
                      \A\n?           # the beginning of the doc
                  )
                  (                       # save in $1
                      [ ]{0,$less_than_tab}
                      <(hr)               # start tag = $2
                      \b                  # word break
                      ([^<>])*?           #
                      /?>                 # the matching end tag
                      [ \t]*
                      (?=\n{2,}|\Z)       # followed by a blank line or end of document
                  )
              }{
                  $stash_block->($1);
              }egx;

      # Special case for standalone HTML comments:
      $text =~ s{
                  (?:
                      (?<=\n\n)       # Starting after a blank line
                      |               # or
                      \A\n?           # the beginning of the doc
                  )
                  (                       # save in $1
                      [ ]{0,$less_than_tab}
                      (?s:
                          <!
                          (--.*?--\s*)+
                          >
                      )
                      [ \t]*
                      (?=\n{2,}|\Z)       # followed by a blank line or end of document
                  )
              }{
                  $stash_block->($1);
              }egx;

      # PHP and ASP-style processor instructions (<?...?> and <%...%>)
      $text =~ s{
                  (?:
                      (?<=\n\n)       # Starting after a blank line
                      |               # or
                      \A\n?           # the beginning of the doc
                  )
                  (                       # save in $1
                      [ ]{0,$less_than_tab}
                      (?s:
                          <([?%])         # $2
                          .*?
                          \2>
                      )
                      [ \t]*
                      (?=\n{2,}|\Z)       # followed by a blank line or end of document
                  )
              }{
                  $stash_block->($1);
              }egx;

      return $text;
  }
  sub _RunBlockGamut {
  #
  # Applies every transformation that produces block-level tags
  # (headers, horizontal rules, lists, code blocks, block quotes and
  # paragraphs) to the given text and returns the result.
  #
      my ($text) = @_;

      $text = _DoHeaders($text);

      # Do Horizontal Rules: a line made of three or more asterisks, hyphens
      # or underscores (each kind handled in its own pass, in that order),
      # optionally separated by single spaces.
      for my $rule_char ( qw( \* - _ ) ) {
          $text =~ s{^[ ]{0,2}([ ]?${rule_char}[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
      }

      $text = _DoLists($text);
      $text = _DoCodeBlocks($text);
      $text = _DoBlockQuotes($text);

      # _HashHTMLBlocks() already ran once in Markdown(), to protect raw HTML
      # found in the original source. This second run protects the markup
      # generated just above, so that <p> tags are not wrapped around
      # block-level tags.
      $text = _HashHTMLBlocks($text);

      $text = _FormParagraphs($text);

      return $text;
  }
  sub _RunSpanGamut {
  #
  # Applies every transformation that happens *within* block-level tags
  # like paragraphs, headers, and list items, and returns the result.
  #
      my ($text) = @_;

      # The passes run strictly in this order:
      #  - Images come before anchors, because ![foo][f] looks like an anchor.
      #  - Auto-links (things like `<http://example.com/>`) come after
      #    anchors, because < and > can delimit the URL of an inline link
      #    like [this](<url>).
      my @span_passes = (
          \&_DoCodeSpans,
          \&_EscapeSpecialCharsWithinTagAttributes,
          \&_EncodeBackslashEscapes,
          \&_DoImages,
          \&_DoAnchors,
          \&_DoAutoLinks,
          \&_EncodeAmpsAndAngles,
          \&_DoItalicsAndBold,
      );
      for my $pass (@span_passes) {
          $text = $pass->($text);
      }

      # Do hard breaks: two or more spaces before a newline become a <br>.
      $text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g;

      return $text;
  }
  sub _EscapeSpecialCharsWithinTagAttributes {
  #
  # Within tags -- meaning between < and > -- encode [\ ` * _] so they
  # don't conflict with their use in Markdown for code, italics and strong.
  # We're replacing each such character with its corresponding MD5 checksum
  # value; this is likely overkill, but it should prevent us from colliding
  # with the escape values by accident.
  #
  # Takes the text to process and returns the re-assembled text.
  #
      my $text = shift;

      # _TokenizeHTML() is defined elsewhere in this file. As read below, it
      # is expected to return a reference to a list of [type, text] pairs.
      my $tokens = _TokenizeHTML($text);

      # Rebuild the text from the tokens, escaping only inside tag tokens.
      my $rebuilt = '';
      foreach my $cur_token ( @{$tokens} ) {
          my ( $type, $content ) = @{$cur_token};
          if ( $type eq "tag" ) {
              $content =~ s! \\ !$g_escape_table{'\\'}!gx;
              $content =~ s{ (?<=.)</?code>(?=.) }{$g_escape_table{'`'}}gx;
              $content =~ s! \* !$g_escape_table{'*'}!gx;
              $content =~ s! _ !$g_escape_table{'_'}!gx;
          }
          $rebuilt .= $content;
      }

      return $rebuilt;
  }
  sub _DoAnchors {
  #
  # Turn Markdown link shortcuts into XHTML <a> tags.
  #
  # Takes the text to process and returns it with links converted.
  #
      my $text = shift;

      # Returns a copy of the string with * and _ replaced by their escape
      # values, to avoid conflicting with italics/bold later on.
      my $escape_emphasis = sub {
          my ($string) = @_;
          $string =~ s! \* !$g_escape_table{'*'}!gx;
          $string =~ s! _ !$g_escape_table{'_'}!gx;
          return $string;
      };

      # Builds an <a> tag; the title attribute is left out when $title is
      # undefined.
      my $anchor_tag = sub {
          my ( $url, $title, $link_text ) = @_;
          my $tag = "<a href=\"" . $escape_emphasis->($url) . "\"";
          if ( defined $title ) {
              $tag .= " title=\"" . $escape_emphasis->($title) . "\"";
          }
          return $tag . ">$link_text</a>";
      };

      # Resolves a link id through %g_urls / %g_titles. When the id is not
      # defined, the original source text ($fallback) is returned unchanged.
      my $reference_link = sub {
          my ( $link_id, $link_text, $fallback ) = @_;
          return $fallback unless defined $g_urls{$link_id};
          my $title = $g_titles{$link_id};
          return $anchor_tag->( $g_urls{$link_id}, $title, $link_text );
      };

      #
      # First, handle reference-style links: [link text] [id]
      #
      $text =~ s{
          (                   # wrap whole match in $1
            \[
              ($g_nested_brackets)    # link text = $2
            \]

            [ ]?              # one optional space
            (?:\n[ ]*)?       # one optional newline followed by spaces

            \[
              (.*?)       # id = $3
            \]
          )
      }{
          my ( $whole_match, $link_text, $link_id ) = ( $1, $2, lc $3 );

          # for shortcut links like [this][].
          $link_id = lc $link_text if $link_id eq "";

          $reference_link->( $link_id, $link_text, $whole_match );
      }xsge;

      #
      # Next, inline-style links: [link text](url "optional title")
      #
      $text =~ s{
          (               # wrap whole match in $1
            \[
              ($g_nested_brackets)    # link text = $2
            \]
            \(            # literal paren
              [ \t]*
              <?(.*?)>?   # href = $3
              [ \t]*
              (           # $4
                (['"])    # quote char = $5
                (.*?)     # Title = $6
                \5        # matching quote
                [ \t]*    # ignore any spaces/tabs between closing quote and )
              )?          # title is optional
            \)
          )
      }{
          my ( $link_text, $url, $title ) = ( $2, $3, $6 );

          # Inline titles come straight from the source, so double quotes
          # still have to be encoded here.
          $title =~ s/"/&quot;/g if defined $title;

          $anchor_tag->( $url, $title, $link_text );
      }xsge;

      #
      # Last, handle reference-style shortcuts: [link text]
      # These must come last in case you've also got [link test][1]
      # or [link test](/foo)
      #
      $text =~ s{
          (                   # wrap whole match in $1
            \[
              ([^\[\]]+)      # link text = $2; can't contain '[' or ']'
            \]
          )
      }{
          my ( $whole_match, $link_text ) = ( $1, $2 );

          # lower-case and turn embedded newlines into spaces
          ( my $link_id = lc $link_text ) =~ s{[ ]?\n}{ }g;

          $reference_link->( $link_id, $link_text, $whole_match );
      }xsge;

      return $text;
  }
  sub _DoImages {
  #
  # Turn Markdown image shortcuts into <img> tags.
  #
  # Takes the text to process and returns it with images converted. A title
  # attribute is written only when a title was actually supplied, for both
  # reference-style and inline images.
  #
      my $text = shift;

      # Returns a copy of the string with * and _ replaced by their escape
      # values, to avoid conflicting with italics/bold later on.
      my $escape_emphasis = sub {
          my ($string) = @_;
          $string =~ s! \* !$g_escape_table{'*'}!gx;
          $string =~ s! _ !$g_escape_table{'_'}!gx;
          return $string;
      };

      # Builds an <img> tag; the title attribute is left out when $title is
      # undefined. $alt_text is expected to have its quotes encoded already.
      my $image_tag = sub {
          my ( $url, $alt_text, $title ) = @_;
          my $tag = "<img src=\"" . $escape_emphasis->($url) . "\" alt=\"$alt_text\"";
          if ( defined $title ) {
              $tag .= " title=\"" . $escape_emphasis->($title) . "\"";
          }
          return $tag . $g_empty_element_suffix;
      };

      #
      # First, handle reference-style labeled images: ![alt text][id]
      #
      $text =~ s{
          (               # wrap whole match in $1
            !\[
              (.*?)       # alt text = $2
            \]

            [ ]?              # one optional space
            (?:\n[ ]*)?       # one optional newline followed by spaces

            \[
              (.*?)       # id = $3
            \]

          )
      }{
          my ( $whole_match, $alt_text, $link_id ) = ( $1, $2, lc $3 );

          # for shortcut links like ![this][].
          $link_id = lc $alt_text if $link_id eq "";

          $alt_text =~ s/"/&quot;/g;

          my $result;
          if ( defined $g_urls{$link_id} ) {
              my $title = $g_titles{$link_id};
              $result = $image_tag->( $g_urls{$link_id}, $alt_text, $title );
          }
          else {
              # If there's no such link ID, leave intact:
              $result = $whole_match;
          }
          $result;
      }xsge;

      #
      # Next, handle inline images: ![alt text](url "optional title")
      # Don't forget: encode * and _
      $text =~ s{
          (               # wrap whole match in $1
            !\[
              (.*?)       # alt text = $2
            \]
            \s?           # One optional whitespace character
            \(            # literal paren
              [ \t]*
              <?(\S+?)>?  # src url = $3
              [ \t]*
              (           # $4
                (['"])    # quote char = $5
                (.*?)     # title = $6
                \5        # matching quote
                [ \t]*
              )?          # title is optional
            \)
          )
      }{
          # $title stays undefined when the source has no title part, so
          # that no empty title="" attribute is written for such images.
          my ( $alt_text, $url, $title ) = ( $2, $3, $6 );

          $alt_text =~ s/"/&quot;/g;
          $title    =~ s/"/&quot;/g if defined $title;

          $image_tag->( $url, $alt_text, $title );
      }xsge;

      return $text;
  }
  sub _DoHeaders {
  #
  # Converts Setext-style and atx-style headers into <h1>..<h6> tags.
  #
  # Takes the text to process and returns it with headers converted.
  #
      my ($text) = @_;

      # Wraps span-processed header text in an <hN> element followed by a
      # blank line.
      my $heading = sub {
          my ( $level, $content ) = @_;
          return "<h$level>" . _RunSpanGamut($content) . "</h$level>\n\n";
      };

      # Setext-style headers:
      #     Header 1
      #     ========
      #
      #     Header 2
      #     --------
      #
      $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
          $heading->( 1, $1 );
      }egmx;

      $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
          $heading->( 2, $1 );
      }egmx;

      # atx-style headers:
      #   # Header 1
      #   ## Header 2
      #   ## Header 2 with closing hashes ##
      #   ...
      #   ###### Header 6
      #
      $text =~ s{
              ^(\#{1,6})  # $1 = string of #'s
              [ \t]*
              (.+?)       # $2 = Header text
              [ \t]*
              \#*         # optional closing #'s (not counted)
              \n+
          }{
              $heading->( length($1), $2 );
          }egmx;

      return $text;
  }
  sub _DoLists {
  #
  # Form HTML ordered (numbered) and unordered (bulleted) lists.
  #
  # Takes the text to process and returns it with lists converted.
  #
      my ($text) = @_;
      my $less_than_tab = $g_tab_width - 1;

      # Re-usable patterns to match list item bullets and number markers:
      my $marker_ul  = qr/[*+-]/;
      my $marker_ol  = qr/\d+[.]/;
      my $marker_any = qr/(?:$marker_ul|$marker_ol)/;

      # Re-usable pattern to match any entire ul or ol list:
      my $whole_list = qr{
          (                               # $1 = whole list
            (                             # $2
              [ ]{0,$less_than_tab}
              (${marker_any})             # $3 = first list item marker
              [ \t]+
            )
            (?s:.+?)
            (                             # $4
                \z
              |
                \n{2,}
                (?=\S)
                (?!                       # Negative lookahead for another list item marker
                  [ \t]*
                  ${marker_any}[ \t]+
                )
            )
          )
      }mx;

      # Turns one matched list into its <ul>/<ol> markup. Nested lists and
      # top-level lists differ only in the whitespace around the items.
      my $render_list = sub {
          my ( $list, $first_marker, $is_nested ) = @_;
          my $list_type = ( $first_marker =~ m/$marker_ul/ ) ? "ul" : "ol";

          # Turn double returns into triple returns, so that we can make a
          # paragraph for the last item in a list, if necessary:
          $list =~ s/\n{2,}/\n\n\n/g;
          my $items = _ProcessListItems( $list, $marker_any );

          if ($is_nested) {
              # Trim any trailing whitespace, to put the closing
              # `</$list_type>` up on the preceding line, to get it past
              # the HTML block parser. This is a workaround for that
              # parser's limitations.
              $items =~ s{\s+$}{};
              return "<$list_type>" . $items . "</$list_type>\n";
          }

          return "<$list_type>\n" . $items . "</$list_type>\n";
      };

      # We use a different prefix before nested lists than top-level lists.
      # See extended comment in _ProcessListItems().
      #
      # The two branches below deliberately keep two separate, static s///
      # statements instead of one statement with a conditionally chosen
      # pattern. According to the original author, the single-statement
      # form caused rebuilds to be killed when running under Movable Type on
      # one hosting account; the suspected cause was Perl being unable to
      # optimize a substitution whose pattern can change.
      if ($g_list_level) {
          $text =~ s{
                  ^
                  $whole_list
              }{
                  $render_list->( $1, $3, 1 );
              }egmx;
      }
      else {
          $text =~ s{
                  (?:(?<=\n\n)|\A\n?)
                  $whole_list
              }{
                  $render_list->( $1, $3, 0 );
              }egmx;
      }

      return $text;
  }
  sub _ProcessListItems {
  #
  # Process the contents of a single ordered or unordered list, splitting it
  # into individual list items.
  #
  # Takes the list text and the pattern matching any list marker; returns
  # the text with each item wrapped in <li>...</li>.
  #
      my ( $list_str, $marker_any ) = @_;

      # $g_list_level records how deeply nested in lists we currently are:
      # it goes up by one on entry here and back down on exit, so zero means
      # "not inside a list".
      #
      # This matters because outside a list, text such as
      #
      #       I recommend upgrading to version
      #       8. Oops, now this line is treated
      #       as a sub-list.
      #
      # should stay a single paragraph, even though its second line starts
      # with a digit-period-space sequence. Inside a list (or sub-list),
      # that same line is treated as the start of a sub-list. It is a
      # kludge: this part of Markdown's syntax is hard to parse perfectly.
      # A possible remedy would be to require sub-lists to start with a
      # starting cardinal number, e.g. "1." or "a.".
      $g_list_level++;

      # trim trailing blank lines:
      $list_str =~ s/\n{2,}\z/\n/;

      $list_str =~ s{
          (\n)?                           # leading line = $1
          (^[ \t]*)                       # leading whitespace = $2
          ($marker_any) [ \t]+            # list marker = $3
          ((?s:.+?)                       # list item text = $4
          (\n{1,2}))
          (?= \n* (\z | \2 ($marker_any) [ \t]+))
      }{
          my ( $leading_line, $item ) = ( $1, $4 );

          # An item preceded by a blank line, or containing one, gets the
          # full block-level treatment; anything else is a single-span item.
          my $is_block_item = $leading_line || $item =~ m/\n{2,}/;

          if ($is_block_item) {
              $item = _RunBlockGamut( _Outdent($item) );
          }
          else {
              # Recursion for sub-lists:
              $item = _DoLists( _Outdent($item) );
              chomp $item;
              $item = _RunSpanGamut($item);
          }

          "<li>" . $item . "</li>\n";
      }egmx;

      $g_list_level--;
      return $list_str;
  }
  sub _DoCodeBlocks {
  #
  # Turn indented code blocks into `<pre><code>` elements.
  #
  # Params:
  #   $text - string to scan for indented code blocks
  # Returns: $text with each code block replaced by its HTML.
  #
      my $text = shift;

      $text =~ s{
          (?:\n\n|\A)
          (                                   # capture 1 = the code block, one or more lines
              (?:
                  (?:[ ]{$g_tab_width} | \t)  # every line starts with a tab or a tab-width of spaces
                  .*\n+
              )+
          )
          ((?=^[ ]{0,$g_tab_width}\S)|\Z)     # then a non-indented line, or the end of the document
      }{
          my $block = $1;

          # Remove one indent level, escape the code, then expand tabs.
          my $code = _Detab(_EncodeCode(_Outdent($block)));

          # Trim newlines at both ends; other whitespace is kept.
          $code =~ s/\A\n+//;
          $code =~ s/\n+\z//;

          "\n\n<pre><code>" . $code . "\n</code></pre>\n\n";
      }egmx;

      return $text;
  }
  sub _DoCodeSpans {
  #
  # Convert backtick-quoted runs into <code></code> spans.
  #
  #   * Multiple backticks may be used as the delimiters when the code
  #     itself contains literal backticks. So, this input:
  #
  #         Just type ``foo `bar` baz`` at the prompt.
  #
  #     translates to:
  #
  #         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
  #
  #     There is no arbitrary limit on the number of backticks used as
  #     delimiters. If the code needs three consecutive backticks, use
  #     four for the delimiters, and so on.
  #
  #   * Spaces next to the delimiters allow literal backticks at the
  #     edges of the span:
  #
  #         ... type `` `bar` `` ...
  #
  #     turns into:
  #
  #         ... type <code>`bar`</code> ...
  #
  # Params:
  #   $text - string to scan for code spans
  # Returns: $text with every code span replaced by its HTML.
  #
      my $text = shift;

      $text =~ s{
          (?<!\\)     # the opening run must not be preceded by a backslash
          (`+)        # capture 1 = opening run of backticks
          (.+?)       # capture 2 = the code itself
          (?<!`)
          \1          # closing run, identical to the opening one
          (?!`)
      }{
          my $code = $2;
          $code =~ s/^[ \t]*//g;      # leading whitespace
          $code =~ s/[ \t]*$//g;      # trailing whitespace
          "<code>" . _EncodeCode($code) . "</code>";
      }egsx;

      return $text;
  }
  sub _EncodeCode {
  #
  # Encode/escape characters inside Markdown code runs, where they are
  # literals and must lose any special HTML or Markdown meaning.
  #
  # Params:
  #   the code text (first positional argument)
  # Returns: the encoded copy of that text.
  #
      my $code = shift;

      # Every ampersand is encoded: inside a code span an HTML entity is
      # just text, not an entity.
      $code =~ s/&/&amp;/g;

      # Dollar signs are encoded only under Blosxom, which interpolates
      # Perl variables in article bodies.
      {
          no warnings 'once';
          $code =~ s/\$/&#036;/g if defined($blosxom::version);
      }

      # The angle bracket song and dance:
      $code =~ s/</&lt;/g;
      $code =~ s/>/&gt;/g;

      # Finally hide the characters that are magic in Markdown behind
      # their escape-table placeholders.
      for my $magic ('*', '_', '{', '}', '[', ']', '\\') {
          my $literal = quotemeta $magic;
          $code =~ s/$literal/$g_escape_table{$magic}/g;
      }

      return $code;
  }
  sub _DoItalicsAndBold {
  #
  # Convert **text** / __text__ to <strong> and *text* / _text_ to <em>.
  #
  # Params:
  #   $text - string to process
  # Returns: $text with emphasis markers replaced by HTML tags.
  #
      my $text = shift;

      # Order matters: <strong> must go first, otherwise the single-character
      # <em> pattern would consume the doubled delimiters.
      my @rules = (
          [ qr{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }sx, 'strong' ],
          [ qr{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }sx,         'em'     ],
      );

      for my $rule (@rules) {
          my ($pattern, $tag) = @$rule;
          $text =~ s{$pattern}{<$tag>$2</$tag>}g;
      }

      return $text;
  }
  sub _DoBlockQuotes {
  #
  # Convert email-style "> quoted" blocks into <blockquote> elements. The
  # quoted text is run through the block gamut again, so nested quotes and
  # other block-level markup inside the quote are processed too.
  #
  # Params:
  #   $text - string to scan for blockquotes
  # Returns: $text with every blockquote replaced by its HTML.
  #
      my $text = shift;

      $text =~ s{
          (                           # capture 1 = the whole blockquote
              (
                  ^[ \t]*>[ \t]?      # '>' at the start of a line
                  .+\n                # rest of the first line
                  (.+\n)*             # subsequent consecutive lines
                  \n*                 # blanks
              )+
          )
      }{
          my $bq = $1;
          $bq =~ s/^[ \t]*>[ \t]?//gm;    # trim one level of quoting
          $bq =~ s/^[ \t]+$//mg;          # trim whitespace-only lines
          $bq = _RunBlockGamut($bq);      # recurse

          # Indent every line of the quoted HTML. The /m flag is essential:
          # without it ^ matches only at the very start of the string, so
          # just the first line would be indented while the <pre> clean-up
          # below still removes a leading space from every <pre> line,
          # eating real indentation out of quoted code blocks.
          $bq =~ s/^/ /mg;

          # The indentation must not leak into <pre> content, so take it
          # back out of those sections:
          $bq =~ s{
              (\s*<pre>.+?</pre>)
          }{
              my $pre = $1;
              $pre =~ s/^ //mg;
              $pre;
          }egsx;

          "<blockquote>\n$bq\n</blockquote>\n\n";
      }egmx;

      return $text;
  }
  sub _FormParagraphs {
  #
  # Wrap paragraphs in <p> tags and swap hashed HTML blocks back in.
  #
  # Params:
  #   $text - string to process with html <p> tags
  # Returns: the paragraphs and restored HTML blocks, joined by blank lines.
  #
      my $text = shift;

      # Strip leading and trailing blank lines, then split into chunks on
      # runs of two or more newlines.
      $text =~ s/\A\n+//;
      $text =~ s/\n+\z//;
      my @grafs = split(/\n{2,}/, $text);

      #
      # Pass 1: wrap <p> tags around everything that is not the hash key
      # of a stored HTML block.
      #
      for my $graf (@grafs) {
          next if defined( $g_html_blocks{$graf} );
          $graf = _RunSpanGamut($graf);
          $graf =~ s/^([ \t]*)/<p>/;
          $graf .= "</p>";
      }

      #
      # Pass 2: unhashify HTML blocks, modifying @grafs in place.
      #
      for my $graf (@grafs) {
          my $block = $g_html_blocks{$graf};
          next unless defined $block;
          $graf = $block;

          # A <div markdown="1"> block gets its contents processed as
          # Markdown; any other block is restored verbatim.
          my ($div_open, undef, $div_content, $div_close) = $block =~ m{
              \A
              (                               # capture 1 = opening <div> tag
                  <div \s+
                  [^>]*
                  \b
                  markdown\s*=\s* (['"])      # capture 2 = attribute quote char
                  1
                  \2
                  [^>]*
                  >
              )
              (                               # capture 3 = contents
                  .*
              )
              (</div>)                        # capture 4 = closing tag
              \z
          }xms
              or next;

          # We can't call Markdown(), because that resets the hash;
          # that initialization code should be pulled into its own sub, though.
          $div_content = _HashHTMLBlocks($div_content);
          $div_content = _StripLinkDefinitions($div_content);
          $div_content = _RunBlockGamut($div_content);
          $div_content = _UnescapeSpecialChars($div_content);

          # Drop the markdown attribute from the emitted tag.
          $div_open =~ s{\smarkdown\s*=\s*(['"]).+?\1}{}ms;

          $graf = join("\n", $div_open, $div_content, $div_close);
      }

      return join "\n\n", @grafs;
  }
  sub _EncodeAmpsAndAngles {
  #
  # Smart processing for ampersands and angle brackets that need to be
  # encoded.
  #
  # Params:
  #   $text - string to process
  # Returns: $text with bare "&" and "<" turned into entities.
  #
      my $text = shift;

      # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
      # http://bumppo.net/projects/amputator/
      # An ampersand that already starts something shaped like an entity
      # is left alone.
      my $entity_rest = qr/#?[xX]?(?:[0-9a-fA-F]+|\w+);/;
      $text =~ s/&(?!$entity_rest)/&amp;/g;

      # Encode naked <'s, i.e. those not followed by a character that can
      # begin a tag, closing tag, processing instruction or declaration.
      my $tag_opener = qr{[a-z/?\$!]}i;
      $text =~ s/<(?!$tag_opener)/&lt;/g;

      return $text;
  }
  sub _EncodeBackslashEscapes {
  #
  # Parameter: String.
  # Returns:   The string, after replacing each supported backslash
  #            escape sequence with the escape-table placeholder of the
  #            escaped character.
  #
      my $text = shift;

      # Escaped backslashes must be processed first; the rest follow in a
      # fixed order.
      my @escapable = (
          '\\', '`', '*', '_', '{', '}', '[', ']',
          '(',  ')', '>', '#', '+', '-', '.', '!',
      );

      for my $char (@escapable) {
          my $sequence = quotemeta('\\' . $char);
          $text =~ s/$sequence/$g_escape_table{$char}/g;
      }

      return $text;
  }
  sub _DoAutoLinks {
  #
  # Turn <http://...>-style URLs and <address@domain> email addresses
  # written in angle brackets into links.
  #
  # Params:
  #   $text - string to process
  # Returns: $text with automatic links replaced by <a> elements.
  #
      my $text = shift;

      # URLs: <scheme:...> for the http, https, ftp and dict schemes.
      my $url_link = qr{<((https?|ftp|dict):[^'">\s]+)>}i;
      $text =~ s{$url_link}{ '<a href="' . $1 . '">' . $1 . '</a>' }ge;

      # Email addresses: <address@domain.foo>
      my $email_link = qr{
          <
          (?:mailto:)?
          (
              [-.\w]+
              \@
              [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
          )
          >
      }ix;
      $text =~ s{$email_link}{ _EncodeEmailAddress( _UnescapeSpecialChars($1) ) }ge;

      return $text;
  }
  sub _EncodeEmailAddress {
  #
  # Input: an email address, e.g. "foo@example.com"
  #
  # Output: the email address as a mailto link, with each character
  # of the address encoded as either a decimal or hex entity, in
  # the hopes of foiling most address harvesting spam bots. E.g.:
  #
  #   <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
  #   x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
  #   &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
  #
  # The encoding is randomised, so two calls with the same address
  # normally produce different (but equivalent) markup.
  #
  # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
  # mailing list: <http://tinyurl.com/yu7ue>
  #
      my $addr = shift;

      # No explicit srand here: rand() seeds the generator itself on first
      # use, and re-seeding on every call only throws randomness away.

      my @encode = (
          sub { '&#' . ord(shift) . ';' },                     # decimal entity
          sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },   # hex entity
          sub { shift },                                       # raw character
      );

      $addr = "mailto:" . $addr;

      $addr =~ s{(.)}{
          my $char = $1;
          if ( $char eq '@' ) {
              # this *must* be encoded. I insist. Pick the decimal or the
              # hex form at random; "int rand 1" would always yield 0 and
              # so never use the hex form.
              $char = $encode[int rand 2]->($char);
          }
          elsif ( $char ne ':' ) {
              # leave ':' alone (to spot mailto: later)
              my $r = rand;
              # roughly 10% raw, 45% hex, 45% dec
              $char = (
                  $r > .9  ? $encode[2]->($char) :
                  $r < .45 ? $encode[1]->($char) :
                             $encode[0]->($char)
              );
          }
          $char;
      }gex;

      $addr = qq{<a href="$addr">$addr</a>};
      $addr =~ s{">.+?:}{">};    # strip the mailto: from the visible part

      return $addr;
  }
  sub _UnescapeSpecialChars {
  #
  # Swap back in all the special characters we've hidden: every
  # escape-table placeholder found in the text is replaced by the
  # character it stands for.
  #
  # Params:
  #   $text - string possibly containing placeholders
  # Returns: $text with the original characters restored.
  #
      my $text = shift;

      for my $char (keys %g_escape_table) {
          my $placeholder = $g_escape_table{$char};
          $text =~ s/$placeholder/$char/g;
      }

      return $text;
  }
  sub _TokenizeHTML {
  #
  # Parameter: String containing HTML markup.
  # Returns:   Reference to an array of the tokens comprising the input
  #            string. Each token is either a tag (possibly with nested
  #            tags contained therein, such as <a href="<MTFoo>">), or a
  #            run of text between tags. Each element of the array is a
  #            two-element array; the first is either 'tag' or 'text';
  #            the second is the actual value.
  #
  # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
  #   <http://www.bradchoate.com/past/mtregex.php>
  #
      my $str = shift;

      # Build a pattern for tags nested up to $depth levels: $depth
      # openers joined by alternation, followed by $depth closers.
      my $depth       = 6;
      my $opener      = '(?:<[a-z/!$](?:[^<>]';
      my $closer      = ')*>)';
      my $nested_tags = join('|', ($opener) x $depth) . ($closer x $depth);

      my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) |  # comment
                     (?s: <\? .*? \?> ) |              # processing instruction
                     $nested_tags/ix;                  # nested tags

      my @tokens;
      my $cursor = 0;     # offset just past the last token emitted

      while ($str =~ m/($match)/g) {
          my $tag       = $1;
          my $tag_end   = pos $str;
          my $tag_start = $tag_end - length $tag;

          # Any text between the previous token and this tag comes first.
          push @tokens, ['text', substr($str, $cursor, $tag_start - $cursor)]
              if $cursor < $tag_start;

          push @tokens, ['tag', $tag];
          $cursor = $tag_end;
      }

      # Whatever follows the last tag is trailing text.
      my $len = length $str;
      push @tokens, ['text', substr($str, $cursor, $len - $cursor)] if $cursor < $len;

      return \@tokens;
  }
  sub _Outdent {
  #
  # Remove one level of line-leading tabs or spaces: from the start of
  # every line, either a single tab or up to $g_tab_width spaces.
  #
  # Params:
  #   $text - string to outdent
  # Returns: the outdented copy of $text.
  #
      my $text = shift;

      $text =~ s{
          ^
          (\t | [ ]{1,$g_tab_width})
      }{}gmx;

      return $text;
  }
  sub _Detab {
  #
  # Expand each tab into spaces up to the next multiple of $g_tab_width,
  # measured from the text matched in front of that tab.
  #
  # Cribbed from a post by Bart Lateur:
  # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
  #
  # Params:
  #   $text - string possibly containing tabs
  # Returns: $text with tabs replaced by spaces.
  #
      my $text = shift;

      $text =~ s{(.*?)\t}{
          my $before = $1;
          my $pad    = $g_tab_width - length($before) % $g_tab_width;
          $before . (' ' x $pad);
      }ge;

      return $text;
  }
  1203. 1;
  1204. __END__
  1205. =pod
  1206. =head1 NAME
  1207. B<Markdown>
  1208. =head1 SYNOPSIS
  1209. B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<--shortversion> ]
  1210. [ I<file> ... ]
  1211. =head1 DESCRIPTION
  1212. Markdown is a text-to-HTML filter; it translates an easy-to-read /
  1213. easy-to-write structured text format into HTML. Markdown's text format
  1214. is most similar to that of plain text email, and supports features such
  1215. as headers, *emphasis*, code blocks, blockquotes, and links.
  1216. Markdown's syntax is designed not as a generic markup language, but
  1217. specifically to serve as a front-end to (X)HTML. You can use span-level
  1218. HTML tags anywhere in a Markdown document, and you can use block level
  1219. HTML tags (like <div> and <table>) as well.
  1220. For more information about Markdown's syntax, see:
  1221. http://daringfireball.net/projects/markdown/
  1222. =head1 OPTIONS
  1223. Use "--" to end switch parsing. For example, to open a file named "-z", use:
  1224. Markdown.pl -- -z
  1225. =over 4
  1226. =item B<--html4tags>
  1227. Use HTML 4 style for empty element tags, e.g.:
  1228. <br>
  1229. instead of Markdown's default XHTML style tags, e.g.:
  1230. <br />
  1231. =item B<-v>, B<--version>
  1232. Display Markdown's version number and copyright information.
  1233. =item B<-s>, B<--shortversion>
  1234. Display the short-form version number.
  1235. =back
  1236. =head1 BUGS
  1237. To file bug reports or feature requests (other than topics listed in the
  1238. Caveats section above) please send email to:
  1239. support@daringfireball.net
  1240. Please include with your report: (1) the example input; (2) the output
  1241. you expected; (3) the output Markdown actually produced.
  1242. =head1 VERSION HISTORY
  1243. See the readme file for detailed release notes for this version.
  1244. 1.0.2b7
  1245. + Changed shebang line from "/usr/bin/perl" to "/usr/bin/env perl"
  1246. + Now only trim trailing newlines from code blocks, instead of trimming
  1247. all trailing whitespace characters.
  1248. 1.0.2b6 - Mon 03 Apr 2006
  1249. + Fixed bad performance bug in new `Text::Balanced`-based block-level parser.
  1250. 1.0.2b5 - Thu 08 Dec 2005
  1251. + Fixed bug where this:
  1252. [text](http://m.com "title" )
  1253. wasn't working as expected, because the parser wasn't allowing for spaces
  1254. before the closing paren.
  1255. 1.0.2b4 - Thu 08 Sep 2005
  1256. + Filthy hack to support markdown='1' in div tags, because I need it
  1257. to write today's fireball.
  1258. + First crack at a new, smarter, block-level HTML parser.
  1259. 1.0.2b3 - Thu 28 Apr 2005
  1260. + _DoAutoLinks() now supports the 'dict://' URL scheme.
  1261. + PHP- and ASP-style processor instructions are now protected as
  1262. raw HTML blocks.
  1263. <? ... ?>
  1264. <% ... %>
  1265. + Workarounds for regressions introduced with fix for "backticks within
  1266. tags" bug in 1.0.2b1. The fix is to allow `...` to be turned into
  1267. <code>...</code> within an HTML tag attribute, and then to turn
  1268. these spurious `<code>` tags back into literal backtick characters
  1269. in _EscapeSpecialCharsWithinTagAttributes().
  1270. The regression was caused because in the fix, we moved
  1271. _EscapeSpecialCharsWithinTagAttributes() ahead of _DoCodeSpans()
  1272. in _RunSpanGamut(), but that's no good. We need to process code
  1273. spans first, otherwise we can get tripped up by something like this:
  1274. `<test a="` content of attribute `">`
  1275. 1.0.2b2 - 20 Mar 2005
  1276. + Fix for nested sub-lists in list-paragraph mode. Previously we got
  1277. a spurious extra level of `<p>` tags for something like this:
  1278. * this
  1279. * sub
  1280. that
  1281. + Experimental support for [this] as a synonym for [this][].
  1282. (Note to self: No test yet for this.)
  1283. Be sure to test, e.g.: [permutations of this sort of [thing][].]
  1284. 1.0.2b1 - 28 Feb 2005
  1285. + Fix for backticks within HTML tag: <span attr='`ticks`'>like this</span>
  1286. + Fix for escaped backticks still triggering code spans:
  1287. There are two raw backticks here: \` and here: \`, not a code span
  1288. 1.0.1 - 14 Dec 2004
  1289. 1.0 - 28 Aug 2004
  1290. =head1 AUTHOR
  1291. John Gruber
  1292. http://daringfireball.net
  1293. PHP port and other contributions by Michel Fortin
  1294. http://michelf.com
  1295. =head1 COPYRIGHT AND LICENSE
  1296. Copyright (c) 2003-2005 John Gruber
  1297. <http://daringfireball.net/>
  1298. All rights reserved.
  1299. Redistribution and use in source and binary forms, with or without
  1300. modification, are permitted provided that the following conditions are
  1301. met:
  1302. * Redistributions of source code must retain the above copyright notice,
  1303. this list of conditions and the following disclaimer.
  1304. * Redistributions in binary form must reproduce the above copyright
  1305. notice, this list of conditions and the following disclaimer in the
  1306. documentation and/or other materials provided with the distribution.
  1307. * Neither the name "Markdown" nor the names of its contributors may
  1308. be used to endorse or promote products derived from this software
  1309. without specific prior written permission.
  1310. This software is provided by the copyright holders and contributors "as
  1311. is" and any express or implied warranties, including, but not limited
  1312. to, the implied warranties of merchantability and fitness for a
  1313. particular purpose are disclaimed. In no event shall the copyright owner
  1314. or contributors be liable for any direct, indirect, incidental, special,
  1315. exemplary, or consequential damages (including, but not limited to,
  1316. procurement of substitute goods or services; loss of use, data, or
  1317. profits; or business interruption) however caused and on any theory of
  1318. liability, whether in contract, strict liability, or tort (including
  1319. negligence or otherwise) arising in any way out of the use of this
  1320. software, even if advised of the possibility of such damage.
  1321. =cut