123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127 |
- // Copyright 2017 The Closure Library Authors. All Rights Reserved.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS-IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- /**
- * @fileoverview Contains utility methods to extract text content from HTML.
- * @supported IE 10+, Chrome 26+, Firefox 22+, Safari 7.1+, Opera 15+
- */
- goog.provide('goog.html.textExtractor');
- goog.require('goog.array');
- goog.require('goog.dom.TagName');
- goog.require('goog.html.sanitizer.HtmlSanitizer');
- goog.require('goog.object');
- goog.require('goog.userAgent');
/**
 * Extracts the user-visible text of an untrusted HTML string.
 *
 * The markup is first run through the HtmlSanitizer and the text is then
 * collected from the resulting DOM tree. Unlike goog.html.utils.stripHtmlTags,
 * the extraction attempts to emit newlines between blocks and to omit textual
 * content that would not be shown to the user (such as SCRIPT and STYLE tags).
 * On browsers for which goog.html.textExtractor.isSupported() returns false,
 * the empty string is returned.
 * @param {string} html The untrusted HTML string.
 * @return {string} The extracted text, with leading and trailing whitespace
 *     removed.
 */
// TODO(pelizzi): consider an optional bool parameter to also extract the text
// content of alt attributes and such.
goog.html.textExtractor.extractTextContent = function(html) {
  if (goog.html.textExtractor.isSupported()) {
    // Only the style attribute is let through; every other attribute is
    // disabled to protect against DOM clobbering.
    var sanitizerBuilder = new goog.html.sanitizer.HtmlSanitizer.Builder();
    var htmlSanitizer = sanitizerBuilder.onlyAllowAttributes(['style'])
                            .allowCssStyles()
                            .build();

    // The default policy of the sanitizer strips the content of tags such as
    // SCRIPT and STYLE, whose non-textual content would otherwise end up in
    // the extracted text.
    var sanitizedRoot = htmlSanitizer.sanitizeToDomNode(html);

    // Neither textContent nor innerText spaces block elements properly, so the
    // tree is walked by our own routine, which accounts for that spacing.
    var extractedText =
        goog.html.textExtractor.extractTextContentFromNode_(sanitizedRoot);
    return extractedText.trim();
  }
  return '';
};
/**
 * Recursively extracts text from the supplied DOM node and its descendants.
 *
 * - A BR element yields a single newline.
 * - Any other element yields the concatenated text of its children, wrapped
 *   in newlines when the element is a block element.
 * - A text node yields its value with every run of whitespace collapsed to a
 *   single space. Whitespace at the edges of a text node is kept (as one
 *   space) so that words separated only by an inline tag boundary, such as
 *   "Hello <b>World</b>", are not glued together.
 * - Every other node type yields the empty string.
 *
 * Spaces that end up next to a line break, or at the edges of a block
 * element, are dropped. The result may still start or end with a single
 * space or with newlines; the caller is expected to trim it.
 * @param {!Node} node
 * @return {string}
 * @private
 */
goog.html.textExtractor.extractTextContentFromNode_ = function(node) {
  switch (node.nodeType) {
    case Node.ELEMENT_NODE:
      var element = /** @type {!Element} */ (node);
      // Loose equality is kept from the original comparison.
      // NOTE(review): goog.dom.TagName values may not be primitive strings in
      // every configuration -- confirm before tightening to ===.
      if (element.tagName == goog.dom.TagName.BR) {
        return '\n';
      }
      var result = goog.array
                       .map(
                           node.childNodes,
                           goog.html.textExtractor.extractTextContentFromNode_)
                       .join('');
      // Neighbouring children can each contribute a boundary space: merge
      // those into one, then remove any space touching a line break.
      result = result.replace(/ {2,}/g, ' ').replace(/ ?\n ?/g, '\n');
      if (goog.html.textExtractor.isBlockElement_(element)) {
        // A block starts and ends on its own line, so the spaces at its edges
        // carry no meaning.
        result = '\n' + result.replace(/^ | $/g, '') + '\n';
      }
      return result;
    case Node.TEXT_NODE:
      // Collapse whitespace but do not trim: trimming each text node on its
      // own would merge words across inline element boundaries.
      return node.nodeValue.replace(/\s+/g, ' ');
    default:
      return '';
  }
};
/**
 * The set of tags treated as block elements, i.e. elements whose text is
 * placed on its own line in the extracted output.
 *
 * Besides the classic block-level containers, the set includes list items
 * (LI, DT, DD), table rows (TR) and the HTML5 sectioning elements, which
 * browsers also lay out on separate lines by default. Without them the text
 * of adjacent list items or table rows would be concatenated.
 * @private @const {!Object<!goog.dom.TagName, boolean>}
 */
goog.html.textExtractor.BLOCK_ELEMENTS_ = goog.object.createSet(
    goog.dom.TagName.ADDRESS, goog.dom.TagName.ARTICLE, goog.dom.TagName.ASIDE,
    goog.dom.TagName.BLOCKQUOTE, goog.dom.TagName.CENTER, goog.dom.TagName.DD,
    goog.dom.TagName.DIV, goog.dom.TagName.DL, goog.dom.TagName.DT,
    goog.dom.TagName.FIELDSET, goog.dom.TagName.FOOTER, goog.dom.TagName.FORM,
    goog.dom.TagName.H1, goog.dom.TagName.H2, goog.dom.TagName.H3,
    goog.dom.TagName.H4, goog.dom.TagName.H5, goog.dom.TagName.H6,
    goog.dom.TagName.HEADER, goog.dom.TagName.HR, goog.dom.TagName.LI,
    goog.dom.TagName.NAV, goog.dom.TagName.OL, goog.dom.TagName.P,
    goog.dom.TagName.PRE, goog.dom.TagName.SECTION, goog.dom.TagName.TABLE,
    goog.dom.TagName.TR, goog.dom.TagName.UL);
/**
 * Tells whether the given element is a block element, i.e. whether the browser
 * would visually separate its text content from the text content of the
 * previous node. An element qualifies when its own style declaration sets
 * display to 'block', or when its tag name is listed in BLOCK_ELEMENTS_.
 * @param {!Element} element
 * @return {boolean}
 * @private
 */
goog.html.textExtractor.isBlockElement_ = function(element) {
  // The style check comes first, exactly as in the original expression.
  if (element.style.display === 'block') {
    return true;
  }
  return Object.prototype.hasOwnProperty.call(
      goog.html.textExtractor.BLOCK_ELEMENTS_, element.tagName);
};
/**
 * Reports whether the text extractor can be used in the current browser. The
 * extractor depends on the HTML Sanitizer, which only supports IE starting
 * from version 10; every non-IE browser is reported as supported.
 * Visible for testing.
 * @return {boolean}
 * @package
 */
goog.html.textExtractor.isSupported = function() {
  if (goog.userAgent.IE) {
    // On IE the answer depends on the version check alone.
    return goog.userAgent.isVersionOrHigher(10);
  }
  return true;
};
|