浏览代码

fix(HTML parsing): fix HTML parsing issues with nested tags

Deeply nested HTML tags and recursive tags broke the HTML parser.

Closes #357, closes #387
Estevao Soares dos Santos 8 年之前
父节点
当前提交
6fbc072c2c

+ 51 - 3
dist/showdown.js

@@ -1,4 +1,4 @@
-;/*! showdown 25-04-2017 */
+;/*! showdown 28-05-2017 */
 (function(){
 /**
  * Created by Tivie on 13-07-2015.
@@ -806,6 +806,43 @@ showdown.helper.replaceRecursiveRegExp = function (str, replacement, left, right
   return finalStr;
 };
 
+/**
+ * Returns the index within the passed String object of the first occurrence of the specified regex,
+ * starting the search at fromIndex. Returns -1 if the value is not found.
+ *
+ * @param {string} str string to search
+ * @param {RegExp} regex Regular expression to search
+ * @param {int} [fromIndex = 0] Index to start the search
+ * @returns {Number}
+ * @throws InvalidArgumentError
+ */
+showdown.helper.regexIndexOf = function (str, regex, fromIndex) {
+  'use strict';
+  if (!showdown.helper.isString(str)) {
+    throw 'InvalidArgumentError: first parameter of showdown.helper.regexIndexOf function must be a string';
+  }
+  if (regex instanceof RegExp === false) {
+    throw 'InvalidArgumentError: second parameter of showdown.helper.regexIndexOf function must be an instance of RegExp';
+  }
+  var indexOf = str.substring(fromIndex || 0).search(regex);
+  return (indexOf >= 0) ? (indexOf + (fromIndex || 0)) : indexOf;
+};
+
+/**
+ * Splits the passed string object at the defined index, and returns an array composed of the two substrings
+ * @param {string} str string to split
+ * @param {int} index index to split string at
+ * @returns {[string,string]}
+ * @throws InvalidArgumentError
+ */
+showdown.helper.splitAtIndex = function (str, index) {
+  'use strict';
+  if (!showdown.helper.isString(str)) {
+    throw 'InvalidArgumentError: first parameter of showdown.helper.regexIndexOf function must be a string';
+  }
+  return [str.substring(0, index), str.substring(index)];
+};
+
 /**
  * Obfuscate an e-mail address through the use of Character Entities,
  * transforming ASCII characters into their equivalent decimal or hex entities.
@@ -1905,9 +1942,20 @@ showdown.subParser('hashHTMLBlocks', function (text, options, globals) {
       };
 
   for (var i = 0; i < blockTags.length; ++i) {
-    text = showdown.helper.replaceRecursiveRegExp(text, repFunc, '^ {0,3}<' + blockTags[i] + '\\b[^>]*>', '</' + blockTags[i] + '>', 'gim');
-  }
 
+    var opTagPos,
+        rgx1     = new RegExp('^ {0,3}<' + blockTags[i] + '\\b[^>]*>', 'im'),
+        patLeft  = '<' + blockTags[i] + '\\b[^>]*>',
+        patRight = '</' + blockTags[i] + '>';
+    // 1. Look for the first position of the first opening HTML tag in the text
+    while ((opTagPos = showdown.helper.regexIndexOf(text, rgx1)) !== -1) {
+      //2. Split the text in that position
+      var subTexts = showdown.helper.splitAtIndex(text, opTagPos);
+      //3. Match recursively
+      subTexts[1] = showdown.helper.replaceRecursiveRegExp(subTexts[1], repFunc, patLeft, patRight, 'im');
+      text = subTexts[0].concat(subTexts[1]);
+    }
+  }
   // HR SPECIAL CASE
   text = text.replace(/(\n {0,3}(<(hr)\b([^<>])*?\/?>)[ \t]*(?=\n{2,}))/g,
     showdown.subParser('hashElement')(text, options, globals));

文件差异内容过多而无法显示
+ 0 - 0
dist/showdown.js.map


文件差异内容过多而无法显示
+ 1 - 1
dist/showdown.min.js


文件差异内容过多而无法显示
+ 0 - 0
dist/showdown.min.js.map


+ 37 - 0
src/helpers.js

@@ -273,6 +273,43 @@ showdown.helper.replaceRecursiveRegExp = function (str, replacement, left, right
   return finalStr;
 };
 
+/**
+ * Returns the index within the passed String object of the first occurrence of the specified regex,
+ * starting the search at fromIndex. Returns -1 if the value is not found.
+ *
+ * @param {string} str string to search
+ * @param {RegExp} regex Regular expression to search
+ * @param {int} [fromIndex = 0] Index to start the search
+ * @returns {Number}
+ * @throws InvalidArgumentError
+ */
+showdown.helper.regexIndexOf = function (str, regex, fromIndex) {
+  'use strict';
+  if (!showdown.helper.isString(str)) {
+    throw 'InvalidArgumentError: first parameter of showdown.helper.regexIndexOf function must be a string';
+  }
+  if (regex instanceof RegExp === false) {
+    throw 'InvalidArgumentError: second parameter of showdown.helper.regexIndexOf function must be an instance of RegExp';
+  }
+  var indexOf = str.substring(fromIndex || 0).search(regex);
+  return (indexOf >= 0) ? (indexOf + (fromIndex || 0)) : indexOf;
+};
+
+/**
+ * Splits the passed string object at the defined index, and returns an array composed of the two substrings
+ * @param {string} str string to split
+ * @param {int} index index to split string at
+ * @returns {[string,string]}
+ * @throws InvalidArgumentError
+ */
+showdown.helper.splitAtIndex = function (str, index) {
+  'use strict';
+  if (!showdown.helper.isString(str)) {
+    throw 'InvalidArgumentError: first parameter of showdown.helper.regexIndexOf function must be a string';
+  }
+  return [str.substring(0, index), str.substring(index)];
+};
+
 /**
  * Obfuscate an e-mail address through the use of Character Entities,
  * transforming ASCII characters into their equivalent decimal or hex entities.

+ 13 - 2
src/subParsers/hashHTMLBlocks.js

@@ -49,9 +49,20 @@ showdown.subParser('hashHTMLBlocks', function (text, options, globals) {
       };
 
   for (var i = 0; i < blockTags.length; ++i) {
-    text = showdown.helper.replaceRecursiveRegExp(text, repFunc, '^ {0,3}<' + blockTags[i] + '\\b[^>]*>', '</' + blockTags[i] + '>', 'gim');
-  }
 
+    var opTagPos,
+        rgx1     = new RegExp('^ {0,3}<' + blockTags[i] + '\\b[^>]*>', 'im'),
+        patLeft  = '<' + blockTags[i] + '\\b[^>]*>',
+        patRight = '</' + blockTags[i] + '>';
+    // 1. Look for the first position of the first opening HTML tag in the text
+    while ((opTagPos = showdown.helper.regexIndexOf(text, rgx1)) !== -1) {
+      //2. Split the text in that position
+      var subTexts = showdown.helper.splitAtIndex(text, opTagPos);
+      //3. Match recursively
+      subTexts[1] = showdown.helper.replaceRecursiveRegExp(subTexts[1], repFunc, patLeft, patRight, 'im');
+      text = subTexts[0].concat(subTexts[1]);
+    }
+  }
   // HR SPECIAL CASE
   text = text.replace(/(\n {0,3}(<(hr)\b([^<>])*?\/?>)[ \t]*(?=\n{2,}))/g,
     showdown.subParser('hashElement')(text, options, globals));

+ 12 - 0
test/issues/deeply-nested-HTML-blocks.html

@@ -0,0 +1,12 @@
+<div>
+  <div>
+    <div>
+      <div>
+        text
+      </div>
+      <div>
+        text
+      </div>
+    </div>
+  </div>
+</div>

+ 12 - 0
test/issues/deeply-nested-HTML-blocks.md

@@ -0,0 +1,12 @@
+<div>
+  <div>
+    <div>
+      <div>
+        text
+      </div>
+      <div>
+        text
+      </div>
+    </div>
+  </div>
+</div>

+ 3 - 0
test/issues/one-line-HTML-input.html

@@ -0,0 +1,3 @@
+<div><div>a</div><div>b</div></div>
+<pre><code>&lt;div&gt;**foobar**&lt;/div&gt;
+</code></pre>

+ 3 - 0
test/issues/one-line-HTML-input.md

@@ -0,0 +1,3 @@
+<div><div>a</div><div>b</div></div>
+
+    <div>**foobar**</div>

+ 13 - 0
test/node/showdown.helpers.js

@@ -233,3 +233,16 @@ describe('forEach()', function () {
     });
   });
 });
+
+describe('matchRecursiveRegExp()', function () {
+  'use strict';
+
+  var rRegExp = showdown.helper.matchRecursiveRegExp;
+
+  it('should match nested elements', function () {
+    var result = rRegExp('<div><div>a</div></div>', '<div\\b[^>]*>', '</div>', 'gim');
+    result.should.deep.equal([['<div><div>a</div></div>', '<div>a</div>', '<div>', '</div>']]);
+  });
+
+});
+

部分文件因为文件数量过多而无法显示