htmlprettyprinter.js 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367
  1. // Copyright 2008 The Closure Library Authors. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS-IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. /**
  15. * @fileoverview Provides functions to parse and pretty-print HTML strings.
  16. *
  17. */
  18. goog.provide('goog.format.HtmlPrettyPrinter');
  19. goog.provide('goog.format.HtmlPrettyPrinter.Buffer');
  20. goog.require('goog.dom.TagName');
  21. goog.require('goog.object');
  22. goog.require('goog.string.StringBuffer');
  /**
   * Formatter that makes HTML source easier for a human to read.
   * TODO(user): Add hierarchical indentation.
   * @param {number=} opt_timeOutMillis Upper bound, in milliseconds, on the time
   *     a single #format call may spend. Once it is exceeded the partially
   *     formatted result is returned. Omitted, zero or a negative number means
   *     no timeout.
   * @constructor
   * @final
   */
  goog.format.HtmlPrettyPrinter = function(opt_timeOutMillis) {
    // Anything that is not greater than zero collapses to 0 ("no timeout").
    var budgetMillis = 0;
    if (opt_timeOutMillis > 0) {
      budgetMillis = opt_timeOutMillis;
    }

    /**
     * Time budget for #format in milliseconds; 0 disables the timeout.
     * @type {number}
     * @private
     */
    this.timeOutMillis_ = budgetMillis;
  };
  /**
   * Shared instance used by the static #format helper. Stays null until
   * getInstance_ is called for the first time.
   * @private {goog.format.HtmlPrettyPrinter?}
   */
  goog.format.HtmlPrettyPrinter.instance_ = null;


  /**
   * Returns the shared pretty printer, constructing it (with no timeout
   * argument) on first use.
   * @return {!goog.format.HtmlPrettyPrinter} The shared instance.
   * @private
   */
  goog.format.HtmlPrettyPrinter.getInstance_ = function() {
    var shared = goog.format.HtmlPrettyPrinter.instance_;
    if (shared) {
      return shared;
    }
    shared = new goog.format.HtmlPrettyPrinter();
    goog.format.HtmlPrettyPrinter.instance_ = shared;
    return shared;
  };
  /**
   * Static convenience wrapper: pretty prints with the shared instance. See the
   * prototype #format for details.
   * @param {string} html The HTML text to pretty print.
   * @return {string} Formatted result.
   */
  goog.format.HtmlPrettyPrinter.format = function(html) {
    var printer = goog.format.HtmlPrettyPrinter.getInstance_();
    return printer.format(html);
  };
  /**
   * Pattern used to tokenize HTML for pretty printing. Each match is exactly one
   * token; the alternatives are tried in this order:
   *   comment | meta-tag | tag | text | other-less-than-characters
   * For tag tokens, group 1 captures the optional '/' of an end tag and group 2
   * captures the tag name. Both groups are undefined for every other kind of
   * token.
   *
   * The comment and meta-tag alternatives use [\s\S] instead of '.', because
   * '.' does not match line terminators. This keeps a comment or declaration
   * that spans several lines as one token, consistent with the tag
   * alternative, whose negated character class already spans lines.
   *
   * The regex is global (/g) and therefore carries lastIndex state between
   * exec() calls.
   * @private {!RegExp}
   * @const
   */
  goog.format.HtmlPrettyPrinter.TOKEN_REGEX_ =
      /(?:<!--[\s\S]*?-->|<![\s\S]*?>|<(\/?)(\w+)[^<>]*>|[^<]+|<)/g;
  /**
   * Set of upper-case tag names whose contents are passed through without any
   * added line breaks.
   * @private {!Object}
   * @const
   */
  goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_ =
      goog.object.createSet(
          goog.dom.TagName.SCRIPT,
          goog.dom.TagName.STYLE,
          goog.dom.TagName.PRE,
          'XMP');
  /**
   * Set of 'block' tag names. During pretty printing a line break is requested
   * before the start tag and after the end tag of each of these. The list is
   * drawn mostly from the HTML4 definitions for block and other non-inline
   * tags, excepting the ones in
   * #goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_.
   * @private {!Object}
   * @const
   */
  goog.format.HtmlPrettyPrinter.BLOCK_TAGS_ = goog.object.createSet(
      goog.dom.TagName.ADDRESS,
      goog.dom.TagName.APPLET,
      goog.dom.TagName.AREA,
      goog.dom.TagName.BASE,
      goog.dom.TagName.BASEFONT,
      goog.dom.TagName.BLOCKQUOTE,
      goog.dom.TagName.BODY,
      goog.dom.TagName.CAPTION,
      goog.dom.TagName.CENTER,
      goog.dom.TagName.COL,
      goog.dom.TagName.COLGROUP,
      goog.dom.TagName.DIR,
      goog.dom.TagName.DIV,
      goog.dom.TagName.DL,
      goog.dom.TagName.FIELDSET,
      goog.dom.TagName.FORM,
      goog.dom.TagName.FRAME,
      goog.dom.TagName.FRAMESET,
      goog.dom.TagName.H1,
      goog.dom.TagName.H2,
      goog.dom.TagName.H3,
      goog.dom.TagName.H4,
      goog.dom.TagName.H5,
      goog.dom.TagName.H6,
      goog.dom.TagName.HEAD,
      goog.dom.TagName.HR,
      goog.dom.TagName.HTML,
      goog.dom.TagName.IFRAME,
      goog.dom.TagName.ISINDEX,
      goog.dom.TagName.LEGEND,
      goog.dom.TagName.LINK,
      goog.dom.TagName.MENU,
      goog.dom.TagName.META,
      goog.dom.TagName.NOFRAMES,
      goog.dom.TagName.NOSCRIPT,
      goog.dom.TagName.OL,
      goog.dom.TagName.OPTGROUP,
      goog.dom.TagName.OPTION,
      goog.dom.TagName.P,
      goog.dom.TagName.PARAM,
      goog.dom.TagName.TABLE,
      goog.dom.TagName.TBODY,
      goog.dom.TagName.TD,
      goog.dom.TagName.TFOOT,
      goog.dom.TagName.TH,
      goog.dom.TagName.THEAD,
      goog.dom.TagName.TITLE,
      goog.dom.TagName.TR,
      goog.dom.TagName.UL);
  /**
   * Set of non-block tag names that nevertheless break the flow. A line break
   * is requested after these tags but never before them. Tags drawn from the
   * HTML4 definitions.
   * @private {!Object}
   * @const
   */
  goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_ = goog.object.createSet(
      goog.dom.TagName.BR,
      goog.dom.TagName.DD,
      goog.dom.TagName.DT,
      goog.dom.TagName.LI,
      goog.dom.TagName.NOFRAMES);
  /**
   * Set of empty tag names. A tag in this set is handled as if it were both a
   * start tag and an end tag.
   * @private {!Object}
   * @const
   */
  goog.format.HtmlPrettyPrinter.EMPTY_TAGS_ = goog.object.createSet(
      goog.dom.TagName.BR,
      goog.dom.TagName.HR,
      goog.dom.TagName.ISINDEX);
  /**
   * Breaks up HTML so it's easily readable by the user.
   *
   * The input is split into tokens with TOKEN_REGEX_ and each token is pushed
   * to a Buffer together with "break before" / "break after" hints derived from
   * its tag name. No characters are removed from the (trimmed) input; only
   * line breaks are added.
   *
   * @param {string} html The HTML text to pretty print.
   * @return {string} Formatted result. If the configured timeout elapses, the
   *     not yet processed remainder is appended without further formatting.
   * @throws {Error} Regex error, data loss, or endless loop detected.
   */
  goog.format.HtmlPrettyPrinter.prototype.format = function(html) {
    // Trim leading whitespace, but preserve first indent; in other words, keep
    // any spaces immediately before the first non-whitespace character (that's
    // what $1 is), but remove all other leading whitespace. This adjustment
    // historically had been made in Docs. The motivation is that some
    // browsers prepend several line breaks in designMode.
    html = html.replace(/^\s*?( *\S)/, '$1');

    // Trim trailing whitespace.
    html = html.replace(/\s+$/, '');

    // Keep track of how much time we've used. The clock is only read when a
    // timeout is configured.
    var timeOutMillis = this.timeOutMillis_;
    var startMillis = timeOutMillis ? goog.now() : 0;

    // Handles concatenation of the result and required line breaks.
    var buffer = new goog.format.HtmlPrettyPrinter.Buffer();

    // Local aliases, since these are accessed on every loop iteration.
    var tokenRegex = goog.format.HtmlPrettyPrinter.TOKEN_REGEX_;
    var nonPpTags = goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_;
    var blockTags = goog.format.HtmlPrettyPrinter.BLOCK_TAGS_;
    var breaksFlowTags = goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_;
    var emptyTags = goog.format.HtmlPrettyPrinter.EMPTY_TAGS_;

    // The tokenizer is a shared, stateful /g regex. Always start scanning at
    // the beginning of the input, even if an earlier call exited by throwing
    // and left a stale lastIndex behind.
    tokenRegex.lastIndex = 0;

    // Used to verify we're making progress through our regex tokenization.
    var lastIndex = 0;

    // Stack of currently open non-pretty-printed tags (e.g. nested PRE).
    var nonPpTagStack = [];

    // Loop through each matched token.
    var match;
    while ((match = tokenRegex.exec(html))) {
      var token = match[0];

      // Group 2 (tag name) only participates for tag tokens. exec() always
      // returns both group slots, so the array length cannot be used to tell
      // tags from other tokens.
      var tagName = match[2] ? match[2].toUpperCase() : '';
      var isEndTag = match[1] === '/';

      if (!tagName) {
        // Comment, meta-tag, text or stray '<': no line break.
        buffer.pushToken(false, token, false);
      } else if (nonPpTags.hasOwnProperty(tagName)) {
        if (isEndTag) {
          // Do we have a matching start tag on top of the stack?
          var stackSize = nonPpTagStack.length;
          var startTagName = stackSize ? nonPpTagStack[stackSize - 1] : null;
          if (startTagName === tagName) {
            // End of non-pretty-printed block. Line break after, but only once
            // the outermost such block has been closed.
            nonPpTagStack.pop();
            buffer.pushToken(false, token, !nonPpTagStack.length);
          } else {
            // Malformed HTML. No line breaks.
            buffer.pushToken(false, token, false);
          }
        } else {
          // Start of non-pretty-printed block. Line break before, unless we
          // are already inside another such block.
          buffer.pushToken(!nonPpTagStack.length, token, false);
          nonPpTagStack.push(tagName);
        }
      } else if (nonPpTagStack.length) {
        // Inside non-pretty-printed block, no new line breaks.
        buffer.pushToken(false, token, false);
      } else if (blockTags.hasOwnProperty(tagName)) {
        // Line break before start block tags and after end block tags; empty
        // tags count as both.
        var isEmptyBlockTag = emptyTags.hasOwnProperty(tagName);
        buffer.pushToken(
            isEmptyBlockTag || !isEndTag, token, isEmptyBlockTag || isEndTag);
      } else if (breaksFlowTags.hasOwnProperty(tagName)) {
        // Line break after end (or empty) flow-breaking tags, never before.
        var isEmptyFlowTag = emptyTags.hasOwnProperty(tagName);
        buffer.pushToken(false, token, isEndTag || isEmptyFlowTag);
      } else {
        // All other tags, no line break.
        buffer.pushToken(false, token, false);
      }

      // Double check that we're making progress.
      var newLastIndex = tokenRegex.lastIndex;
      if (!token || newLastIndex <= lastIndex) {
        throw new Error('Regex failed to make progress through source html.');
      }
      lastIndex = newLastIndex;

      // Out of time?
      if (timeOutMillis && goog.now() - startMillis > timeOutMillis) {
        // Push unprocessed data as one big token and reset regex object. An
        // empty remainder is skipped: pushing '' would clear the buffer's
        // "at line start" state and produce two trailing line breaks.
        var remainder = html.substring(tokenRegex.lastIndex);
        if (remainder) {
          buffer.pushToken(false, remainder, false);
        }
        tokenRegex.lastIndex = 0;
        break;
      }
    }

    // Ensure we end in a line break.
    buffer.lineBreak();

    // Construct result string.
    var result = String(buffer);

    // Length should be original length plus # line breaks added.
    var expectedLength = html.length + buffer.breakCount;
    if (result.length != expectedLength) {
      throw new Error('Lost data pretty printing html.');
    }

    return result;
  };
  /**
   * Output buffer for the pretty printer. It accumulates the pushed tokens and
   * keeps track of line breaks so that no unnecessary ones are added.
   * @constructor
   * @final
   */
  goog.format.HtmlPrettyPrinter.Buffer = function() {
    /**
     * Accumulates the output that #toString returns.
     * @type {goog.string.StringBuffer}
     * @private
     */
    this.out_ = new goog.string.StringBuffer();
  };


  /**
   * Whether a line break is still owed before the next token.
   * @type {boolean}
   * @private
   */
  goog.format.HtmlPrettyPrinter.Buffer.prototype.needsNewLine_ = false;


  /**
   * Whether the output is currently positioned at the start of a new line.
   * @type {boolean}
   * @private
   */
  goog.format.HtmlPrettyPrinter.Buffer.prototype.isBeginningOfNewLine_ = true;


  /**
   * Number of line breaks this buffer has added so far.
   * @type {number}
   */
  goog.format.HtmlPrettyPrinter.Buffer.prototype.breakCount = 0;
  /**
   * Adds token and necessary line breaks to output buffer.
   *
   * A line break is inserted in front of the token when one is owed (either
   * requested by the previous token via breakAfter, or by this token via
   * breakBefore), unless the token itself already starts with a line break or
   * the token is a UL end tag.
   *
   * @param {boolean} breakBefore If true, add line break before token if
   *     necessary.
   * @param {string} token Token to push.
   * @param {boolean} breakAfter If true, add line break after token if
   *     necessary. The break is deferred until the next token is pushed.
   */
  goog.format.HtmlPrettyPrinter.Buffer.prototype.pushToken = function(
      breakBefore, token, breakAfter) {
    var breakIsOwed = this.needsNewLine_ || breakBefore;
    var startsWithLineBreak = /^\r?\n/.test(token);

    // Due to FF3.0 bug with lists, we don't insert a \n right before </ul>.
    // See bug 1520665. The pattern is anchored to the start of the token so
    // that only an actual UL end tag qualifies, not any token that merely
    // contains "/ul" (text such as "a/ultra" or an attribute value).
    var isUlEndTag = /^<\/ul\b/i.test(token);

    if (breakIsOwed && !startsWithLineBreak && !isUlEndTag) {
      this.lineBreak();
    }

    // Token.
    this.out_.append(token);

    // Remember if this string ended with a line break so we know we don't have
    // to insert another one before the next token.
    this.isBeginningOfNewLine_ = /\r?\n$/.test(token);

    // Remember if this token requires a line break after it. We don't insert it
    // here because we might not have to if the next token starts with a line
    // break.
    this.needsNewLine_ = breakAfter && !this.isBeginningOfNewLine_;
  };
  /**
   * Appends a line break unless the output is already at the start of a line.
   * Every appended break is counted in #breakCount. Calling this repeatedly
   * without pushing a token in between adds at most one line break.
   */
  goog.format.HtmlPrettyPrinter.Buffer.prototype.lineBreak = function() {
    if (this.isBeginningOfNewLine_) {
      return;
    }
    this.out_.append('\n');
    ++this.breakCount;
    // Record the new position so that a second call, or a following pushToken
    // that also wants a break, does not emit a duplicate line break.
    this.isBeginningOfNewLine_ = true;
  };
  /**
   * @return {string} Everything appended to this buffer so far, i.e. the pushed
   *     tokens and the added line breaks, as one string.
   * @override
   */
  goog.format.HtmlPrettyPrinter.Buffer.prototype.toString = function() {
    var text = this.out_.toString();
    return text;
  };