123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367 |
- // Copyright 2008 The Closure Library Authors. All Rights Reserved.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS-IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- /**
- * @fileoverview Provides functions to parse and pretty-print HTML strings.
- *
- */
- goog.provide('goog.format.HtmlPrettyPrinter');
- goog.provide('goog.format.HtmlPrettyPrinter.Buffer');
- goog.require('goog.dom.TagName');
- goog.require('goog.object');
- goog.require('goog.string.StringBuffer');
/**
 * Formats HTML so that it is easier for a human to read.
 * TODO(user): Add hierarchical indentation.
 * @param {number=} opt_timeOutMillis Upper bound, in milliseconds, on the time
 *     a single #format call may take. Once the budget is exceeded the result
 *     is returned partially formatted. Zero, a negative number, or an omitted
 *     argument means no timeout.
 * @constructor
 * @final
 */
goog.format.HtmlPrettyPrinter = function(opt_timeOutMillis) {
  // Anything that is not a positive value (undefined, 0, negatives, NaN)
  // collapses to 0.
  var budgetMillis = 0;
  if (opt_timeOutMillis && opt_timeOutMillis > 0) {
    budgetMillis = opt_timeOutMillis;
  }

  /**
   * Max # milliseconds to spend on #format. 0 means no timeout.
   * @type {number}
   * @private
   */
  this.timeOutMillis_ = budgetMillis;
};
/**
 * Shared instance used by the static #format helper. Stays null until
 * getInstance_ is first called.
 * @private {goog.format.HtmlPrettyPrinter?}
 */
goog.format.HtmlPrettyPrinter.instance_ = null;


/**
 * Returns the shared instance, constructing it (with no timeout argument) on
 * first use.
 * @return {!goog.format.HtmlPrettyPrinter} Singleton.
 * @private
 */
goog.format.HtmlPrettyPrinter.getInstance_ = function() {
  var shared = goog.format.HtmlPrettyPrinter.instance_;
  if (!shared) {
    shared = new goog.format.HtmlPrettyPrinter();
    goog.format.HtmlPrettyPrinter.instance_ = shared;
  }
  return shared;
};
/**
 * Static convenience wrapper: pretty prints using the shared singleton
 * instance. See prototype #format.
 * @param {string} html The HTML text to pretty print.
 * @return {string} Formatted result.
 */
goog.format.HtmlPrettyPrinter.format = function(html) {
  var printer = goog.format.HtmlPrettyPrinter.getInstance_();
  return printer.format(html);
};
/**
 * Tokenizer for the HTML being pretty printed. Each match is exactly one of
 * the following alternatives, tried in this order:
 *   1. a comment (opening "<!--" through the first "-->"),
 *   2. any other "<!" declaration, up to the first ">",
 *   3. a tag; capture group 1 is "/" for an end tag (empty otherwise) and
 *      capture group 2 is the tag name,
 *   4. a run of text containing no "<",
 *   5. a lone "<" that did not start any of the above.
 * Because "." does not match line terminators, alternatives 1 and 2 only
 * recognize comments and declarations that fit on a single line.
 * The regex carries the global flag, so its lastIndex is shared state between
 * successive exec() calls.
 * @private {!RegExp}
 * @const
 */
goog.format.HtmlPrettyPrinter.TOKEN_REGEX_ =
    /(?:<!--.*?-->|<!.*?>|<(\/?)(\w+)[^<>]*>|[^<]+|<)/g;
/**
 * Set of upper-case tag names whose contents are passed through without any
 * added line breaks.
 * @private {!Object}
 * @const
 */
goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_ = goog.object.createSet(
    goog.dom.TagName.SCRIPT,
    goog.dom.TagName.STYLE,
    goog.dom.TagName.PRE,
    'XMP');
/**
 * 'Block' tags. During pretty printing a line break is added before the start
 * tag and after the end tag of each of these. The list is drawn mostly from
 * the HTML4 definitions of block and other non-inline tags, excepting the ones
 * in #goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_.
 * @private {!Object}
 * @const
 */
goog.format.HtmlPrettyPrinter.BLOCK_TAGS_ = goog.object.createSet(
    goog.dom.TagName.ADDRESS,
    goog.dom.TagName.APPLET,
    goog.dom.TagName.AREA,
    goog.dom.TagName.BASE,
    goog.dom.TagName.BASEFONT,
    goog.dom.TagName.BLOCKQUOTE,
    goog.dom.TagName.BODY,
    goog.dom.TagName.CAPTION,
    goog.dom.TagName.CENTER,
    goog.dom.TagName.COL,
    goog.dom.TagName.COLGROUP,
    goog.dom.TagName.DIR,
    goog.dom.TagName.DIV,
    goog.dom.TagName.DL,
    goog.dom.TagName.FIELDSET,
    goog.dom.TagName.FORM,
    goog.dom.TagName.FRAME,
    goog.dom.TagName.FRAMESET,
    goog.dom.TagName.H1,
    goog.dom.TagName.H2,
    goog.dom.TagName.H3,
    goog.dom.TagName.H4,
    goog.dom.TagName.H5,
    goog.dom.TagName.H6,
    goog.dom.TagName.HEAD,
    goog.dom.TagName.HR,
    goog.dom.TagName.HTML,
    goog.dom.TagName.IFRAME,
    goog.dom.TagName.ISINDEX,
    goog.dom.TagName.LEGEND,
    goog.dom.TagName.LINK,
    goog.dom.TagName.MENU,
    goog.dom.TagName.META,
    goog.dom.TagName.NOFRAMES,
    goog.dom.TagName.NOSCRIPT,
    goog.dom.TagName.OL,
    goog.dom.TagName.OPTGROUP,
    goog.dom.TagName.OPTION,
    goog.dom.TagName.P,
    goog.dom.TagName.PARAM,
    goog.dom.TagName.TABLE,
    goog.dom.TagName.TBODY,
    goog.dom.TagName.TD,
    goog.dom.TagName.TFOOT,
    goog.dom.TagName.TH,
    goog.dom.TagName.THEAD,
    goog.dom.TagName.TITLE,
    goog.dom.TagName.TR,
    goog.dom.TagName.UL);
/**
 * Non-block tags that still break the flow: a line break is inserted after
 * them, but not before. Tags drawn from HTML4 definitions.
 * Note: NOFRAMES also appears in BLOCK_TAGS_, and #format consults BLOCK_TAGS_
 * first, so the NOFRAMES entry here is never the one that takes effect.
 * @private {!Object}
 * @const
 */
goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_ = goog.object.createSet(
    goog.dom.TagName.BR,
    goog.dom.TagName.DD,
    goog.dom.TagName.DT,
    goog.dom.TagName.LI,
    goog.dom.TagName.NOFRAMES);
/**
 * Empty tags. A single occurrence of one of these is handled as if it were
 * both a start tag and an end tag when deciding where line breaks go.
 * @private {!Object}
 * @const
 */
goog.format.HtmlPrettyPrinter.EMPTY_TAGS_ = goog.object.createSet(
    goog.dom.TagName.BR,
    goog.dom.TagName.HR,
    goog.dom.TagName.ISINDEX);
/**
 * Breaks up HTML so it's easily readable by the user.
 *
 * The input is tokenized with TOKEN_REGEX_ and every token is copied to the
 * output unchanged; the only modification is the insertion of '\n' characters
 * around block-level, flow-breaking and non-pretty-printed tags. Leading
 * whitespace (other than the first indent) and trailing whitespace are
 * trimmed before tokenizing.
 *
 * If a timeout was configured and is exceeded, the not-yet-tokenized rest of
 * the input is appended as-is.
 *
 * @param {string} html The HTML text to pretty print.
 * @return {string} Formatted result.
 * @throws {Error} Regex error, data loss, or endless loop detected.
 */
goog.format.HtmlPrettyPrinter.prototype.format = function(html) {
  // Trim leading whitespace, but preserve first indent; in other words, keep
  // any spaces immediately before the first non-whitespace character (that's
  // what $1 is), but remove all other leading whitespace. This adjustment
  // historically had been made in Docs. The motivation is that some
  // browsers prepend several line breaks in designMode.
  html = html.replace(/^\s*?( *\S)/, '$1');

  // Trim trailing whitespace.
  html = html.replace(/\s+$/, '');

  // Keep track of how much time we've used.
  var timeOutMillis = this.timeOutMillis_;
  var startMillis = timeOutMillis ? goog.now() : 0;

  // Handles concatenation of the result and required line breaks.
  var buffer = new goog.format.HtmlPrettyPrinter.Buffer();

  // Declare these for efficiency since we access them in a loop.
  var tokenRegex = goog.format.HtmlPrettyPrinter.TOKEN_REGEX_;
  var nonPpTags = goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_;
  var blockTags = goog.format.HtmlPrettyPrinter.BLOCK_TAGS_;
  var breaksFlowTags = goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_;
  var emptyTags = goog.format.HtmlPrettyPrinter.EMPTY_TAGS_;

  // The tokenizer is a shared regex with the global flag, so exec() resumes
  // from its lastIndex. Always start from the beginning of this input, even if
  // an earlier call was aborted by an exception and left lastIndex non-zero.
  tokenRegex.lastIndex = 0;

  // Used to verify we're making progress through our regex tokenization.
  var lastIndex = 0;

  // Use this to track non-pretty-printed tags and children.
  var nonPpTagStack = [];

  // Loop through each matched token.
  var match;
  while (match = tokenRegex.exec(html)) {
    var token = match[0];

    // exec() always returns one slot per capture group, so the length of the
    // match array says nothing about the token kind. The tag-name group is
    // only populated when the tag alternative matched.
    var tagName = match[2] ? match[2].toUpperCase() : '';
    var isEndTag = match[1] === '/';

    // Non-tags, tags nested in a non-pretty-printed block, and all tags not
    // listed in one of the sets below get no line breaks.
    var breakBefore = false;
    var breakAfter = false;

    if (tagName) {
      if (nonPpTags.hasOwnProperty(tagName)) {
        if (!isEndTag) {
          // Start of non-pretty-printed block. Line break before, but only
          // for the outermost one.
          breakBefore = !nonPpTagStack.length;
          nonPpTagStack.push(tagName);
        } else if (nonPpTagStack[nonPpTagStack.length - 1] === tagName) {
          // End of non-pretty-printed block. Line break after, but only once
          // the outermost one has been closed.
          nonPpTagStack.pop();
          breakAfter = !nonPpTagStack.length;
        }
        // Otherwise: end tag without matching start tag (malformed HTML).
        // No line breaks.
      } else if (!nonPpTagStack.length) {
        var isEmpty = emptyTags.hasOwnProperty(tagName);
        if (blockTags.hasOwnProperty(tagName)) {
          // Put line break before start block and after end block tags.
          breakBefore = isEmpty || !isEndTag;
          breakAfter = isEmpty || isEndTag;
        } else if (breaksFlowTags.hasOwnProperty(tagName)) {
          // Put line break after end flow-breaking tags.
          breakAfter = isEmpty || isEndTag;
        }
      }
    }

    buffer.pushToken(breakBefore, token, breakAfter);

    // Double check that we're making progress.
    var newLastIndex = tokenRegex.lastIndex;
    if (!token || newLastIndex <= lastIndex) {
      // Do not leave the shared regex pointing into the middle of this input.
      tokenRegex.lastIndex = 0;
      throw Error('Regex failed to make progress through source html.');
    }
    lastIndex = newLastIndex;

    // Out of time?
    if (timeOutMillis && goog.now() - startMillis > timeOutMillis) {
      // Push unprocessed data as one big token and reset regex object. An
      // empty remainder is skipped: pushing an empty token would clear the
      // buffer's "at beginning of line" state and could lead to a doubled
      // line break at the end of the output.
      var remainder = html.substring(tokenRegex.lastIndex);
      if (remainder) {
        buffer.pushToken(false, remainder, false);
      }
      tokenRegex.lastIndex = 0;
      break;
    }
  }

  // Ensure we end in a line break.
  buffer.lineBreak();

  // Construct result string.
  var result = String(buffer);

  // Length should be original length plus # line breaks added.
  var expectedLength = html.length + buffer.breakCount;
  if (result.length != expectedLength) {
    throw Error('Lost data pretty printing html.');
  }

  return result;
};
/**
 * Output buffer for the pretty printer. Besides collecting the tokens it
 * keeps track of line breaks so that no unnecessary ones are added.
 * @constructor
 * @final
 */
goog.format.HtmlPrettyPrinter.Buffer = function() {
  var sink = new goog.string.StringBuffer();

  /**
   * Tokens to be output in #toString.
   * @type {goog.string.StringBuffer}
   * @private
   */
  this.out_ = sink;
};


/**
 * Number of line breaks this buffer has added so far.
 * @type {number}
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.breakCount = 0;


/**
 * Whether the output currently sits at the start of a new line. True for a
 * fresh buffer.
 * @type {boolean}
 * @private
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.isBeginningOfNewLine_ = true;


/**
 * Whether a line break is owed before the next token is written.
 * @type {boolean}
 * @private
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.needsNewLine_ = false;
/**
 * Adds token and necessary line breaks to output buffer.
 *
 * A line break requested after a token is not written immediately; it is
 * remembered and only emitted in front of the next token, because that token
 * may already start with a line break of its own.
 *
 * @param {boolean} breakBefore If true, add line break before token if
 *     necessary.
 * @param {string} token Token to push.
 * @param {boolean} breakAfter If true, add line break after token if
 *     necessary.
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.pushToken = function(
    breakBefore, token, breakAfter) {
  // A break is wanted if the previous token asked for one after itself or
  // this token asks for one before itself.
  var wantsBreak = this.needsNewLine_ || breakBefore;

  // No need to add a break if the token already starts with one.
  var startsWithBreak = /^\r?\n/.test(token);

  // Due to FF3.0 bug with lists, we don't insert a \n right before </ul>.
  // See bug 1520665. The check is anchored to an actual closing UL tag so that
  // tokens which merely contain "/ul" somewhere (for example an attribute
  // value such as href="/ul") still get their line break.
  var isUlEndTag = /^<\/ul\b/i.test(token);

  if (wantsBreak && !startsWithBreak && !isUlEndTag) {
    this.lineBreak();
  }

  // Token.
  this.out_.append(token);

  // Remember if this string ended with a line break so we know we don't have to
  // insert another one before the next token.
  this.isBeginningOfNewLine_ = /\r?\n$/.test(token);

  // Remember if this token requires a line break after it. We don't insert it
  // here because we might not have to if the next token starts with a line
  // break.
  this.needsNewLine_ = breakAfter && !this.isBeginningOfNewLine_;
};
/**
 * Append line break if we need one, i.e. unless the output is already at the
 * beginning of a new line. Every break that is actually appended is counted
 * in breakCount.
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.lineBreak = function() {
  if (this.isBeginningOfNewLine_) {
    return;
  }
  this.out_.append('\n');
  ++this.breakCount;
  // The output now ends in a line break; record that so an immediately
  // repeated call does not append a second one.
  this.isBeginningOfNewLine_ = true;
};
/**
 * Returns everything pushed so far, including the added line breaks, as one
 * string.
 * @return {string} String representation of tokens.
 * @override
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.toString = function() {
  var collected = this.out_;
  return collected.toString();
};
|