utils.js 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. // Copyright 2013 The Closure Library Authors. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS-IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. /**
  15. * @fileoverview HTML processing utilities for HTML in string form.
  16. */
  17. goog.provide('goog.html.utils');
  18. goog.require('goog.string');
  /**
   * Reduces an HTML string to plain text.
   *
   * The result approximates the textContent of a hypothetical DOM element
   * holding the given HTML. Every match of HTML_TAG_REGEX_ is removed: a tag
   * whose name matches INLINE_HTML_TAG_REGEX_ is replaced with the empty
   * string, and any other tag (span included, since it frequently acts as a
   * container) is replaced with a single space. Runs of tabs, newlines and
   * spaces are then collapsed to one space, the text is passed through
   * goog.string.trim, and finally through goog.string.unescapeEntities.
   *
   * @param {string} value The HTML to remove tags from.
   * @return {string} Plain text with the matched markup removed. The result is
   *     NOT safe HTML and must not be treated as such.
   */
  goog.html.utils.stripHtmlTags = function(value) {
    // TODO(user): Make a version that extracts text attributes such as alt.

    // Step 1: drop the tags, leaving a space where a non-inline tag stood so
    // that neighbouring words do not run together.
    var withoutTags = value.replace(
        goog.html.utils.HTML_TAG_REGEX_, function(fullMatch, tagName) {
          if (goog.html.utils.INLINE_HTML_TAG_REGEX_.test(tagName)) {
            return '';
          }
          return ' ';
        });

    // Step 2: collapse each run of tabs, newlines and spaces to one space.
    var collapsed = withoutTags.replace(/[\t\n ]+/g, ' ');

    // Step 3: trim first, then unescape, so whitespace produced by entities is
    // not trimmed away.
    return goog.string.unescapeEntities(goog.string.trim(collapsed));
  };
  /**
   * Tag names that are treated as inline when stripping tags.
   *
   * The pattern is anchored at both ends and case-insensitive, so it accepts
   * only a complete tag name: abbr, acronym, address, b, em, i, small, strong,
   * sub, sup or u. stripHtmlTags removes tags with these names without
   * inserting a space in their place; every other tag is replaced by a space.
   *
   * @private @const
   */
  goog.html.utils.INLINE_HTML_TAG_REGEX_ =
      /^(?:abbr|acronym|address|b|em|i|small|strong|su[bp]|u)$/i;
  /**
   * Matches tags and DOCTYPE-style declarations in tag soup HTML.
   *
   * A match consists of '<', an optional '!' or '/', an alphanumeric name
   * (capture group 1), an optional remainder that starts with '/' or HTML
   * whitespace and runs up to the next '>' (capture group 2), and the closing
   * '>'. The whitespace after the name may be a space, tab, line feed, form
   * feed or carriage return, so a tag whose attributes begin on a new line is
   * matched as well.
   *
   * NOTE(review): HTML comments begin with '<!-', which the name group does
   * not accept, so comments are not matched by this pattern. Removing the
   * matches also does not escape stray '<' or '>' characters, so the
   * remaining text must not be assumed safe to embed in markup.
   *
   * The pattern carries the global flag; it is meant to be passed to
   * String.prototype.replace. Calling test() or exec() on it directly would
   * advance lastIndex between calls.
   *
   * @private @const
   */
  goog.html.utils.HTML_TAG_REGEX_ =
      /<[!\/]?([a-z0-9]+)([\/\t\n\f\r ][^>]*)?>/gi;