uchar.js 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. // Copyright 2009 The Closure Library Authors. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS-IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. /**
  15. * @fileoverview Collection of utility functions for Unicode characters.
  16. *
  17. */
  18. goog.provide('goog.i18n.uChar');
  // UTF-16 surrogate-pair constants shared by the helpers in this file.

  /**
   * Smallest supplementary code point, i.e. the lower bound tested by
   * isSupplementaryCodePoint.
   * @type {number}
   * @private
   */
  goog.i18n.uChar.SUPPLEMENTARY_CODE_POINT_MIN_VALUE_ = 0x10000;

  /**
   * Largest Unicode code point (scalar value) defined by the Unicode Standard.
   * @type {number}
   * @private
   */
  goog.i18n.uChar.CODE_POINT_MAX_VALUE_ = 0x10FFFF;

  /**
   * First code unit of the lead (high) surrogate range.
   * @type {number}
   * @private
   */
  goog.i18n.uChar.LEAD_SURROGATE_MIN_VALUE_ = 0xD800;

  /**
   * Last code unit of the lead (high) surrogate range.
   * @type {number}
   * @private
   */
  goog.i18n.uChar.LEAD_SURROGATE_MAX_VALUE_ = 0xDBFF;

  /**
   * First code unit of the trail (low) surrogate range.
   * @type {number}
   * @private
   */
  goog.i18n.uChar.TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;

  /**
   * Last code unit of the trail (low) surrogate range.
   * @type {number}
   * @private
   */
  goog.i18n.uChar.TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;

  /**
   * How many low-order bits of a supplementary code point are carried by the
   * trail surrogate in UTF-16. The remaining in-use bits of the code point are
   * carried by the lead surrogate.
   * @type {number}
   * @private
   */
  goog.i18n.uChar.TRAIL_SURROGATE_BIT_COUNT_ = 10;
  /**
   * Formats the first character of a string in U+ notation, e.g. 'U+0041' for
   * 'A'. The hexadecimal digits are upper-cased and left-padded with '0' to at
   * least four digits.
   * @param {string} ch The given character.
   * @return {string} The U+ notation of the given character.
   */
  goog.i18n.uChar.toHexString = function(ch) {
    var hexDigits = goog.i18n.uChar.toCharCode(ch).toString(16).toUpperCase();
    return 'U+' + goog.i18n.uChar.padString_(hexDigits, 4, '0');
  };
  /**
   * Left-pads a string with the given character until it reaches the requested
   * length. A string that is already long enough is returned unchanged.
   *
   * If {@code ch} is the empty string, padding can never make progress, so the
   * input is returned as is instead of looping forever. If {@code ch} holds
   * more than one character the result may end up longer than {@code length},
   * because {@code ch} is always prepended as a whole.
   *
   * @param {string} str The given string to be padded.
   * @param {number} length The target size of the string.
   * @param {string} ch The character to be padded with.
   * @return {string} The padded string.
   * @private
   */
  goog.i18n.uChar.padString_ = function(str, length, ch) {
    // Prepending '' never lengthens str, so the loop below would not terminate.
    if (ch === '') {
      return str;
    }
    while (str.length < length) {
      str = ch + str;
    }
    return str;
  };
  /**
   * Returns the Unicode value of the character that starts the given string.
   * Delegates to getCodePointAround at index 0, so a leading surrogate pair is
   * decoded into its supplementary code point.
   * @param {string} ch The given character, which in the case of a
   *     supplementary character is actually a surrogate pair. Anything after
   *     the first character is ignored.
   * @return {number} The Unicode value of the character.
   */
  goog.i18n.uChar.toCharCode = function(ch) {
    var firstIndex = 0;
    return goog.i18n.uChar.getCodePointAround(ch, firstIndex);
  };
  /**
   * Builds the string for a Unicode code point. Values that are null,
   * undefined, or outside 0..CODE_POINT_MAX_VALUE_ yield null. Supplementary
   * code points are encoded as a two-unit surrogate pair; every other value
   * becomes a single-unit string.
   * @param {number} code The Unicode value of the character.
   * @return {?string} The character corresponding to the given Unicode value.
   */
  goog.i18n.uChar.fromCharCode = function(code) {
    // The range test is kept in negated form so that NaN is rejected as well.
    if (code == null ||
        !(code >= 0 && code <= goog.i18n.uChar.CODE_POINT_MAX_VALUE_)) {
      return null;
    }

    if (!goog.i18n.uChar.isSupplementaryCodePoint(code)) {
      return String.fromCharCode(code);
    }

    var bitCount = goog.i18n.uChar.TRAIL_SURROGATE_BIT_COUNT_;

    // Mask selecting the bitCount (i.e. 10) low bits: (1 << 10) - 1 = 0x03FF.
    var trailMask = (1 << bitCount) - 1;

    // Base added to the shifted-down high bits. The supplementary offset was
    // shifted down together with those bits, so it is removed here (also
    // shifted) rather than being subtracted from the code point up front.
    var leadBase = goog.i18n.uChar.LEAD_SURROGATE_MIN_VALUE_ -
        (goog.i18n.uChar.SUPPLEMENTARY_CODE_POINT_MIN_VALUE_ >> bitCount);

    var leadUnit = (code >> bitCount) + leadBase;
    var trailUnit =
        (code & trailMask) + goog.i18n.uChar.TRAIL_SURROGATE_MIN_VALUE_;

    // Emit both surrogates as one 2-character string.
    return String.fromCharCode(leadUnit, trailUnit);
  };
  /**
   * Returns the Unicode code point found at, or straddling, the given index.
   *
   * When the code unit at {@code index} is a lead surrogate and the next code
   * unit exists and is a trail surrogate, the supplementary code point encoded
   * by that pair is returned.
   *
   * When the code unit at {@code index} is a trail surrogate and the previous
   * code unit exists and is a lead surrogate, the NEGATED supplementary code
   * point encoded by that pair is returned. The sign tells the caller whether
   * the index pointed at the first or the second half of the pair, and hence
   * where the neighbouring characters end and begin.
   *
   * In every other case the code unit at {@code index} is returned as is, so
   * an unpaired lead or trail surrogate comes back unchanged.
   *
   * @param {string} string The string.
   * @param {number} index The index from which the code point is to be
   *     retrieved.
   * @return {number} The code point at the given index. If the index is that
   *     of the lead surrogate of a pair, the code point encoded by the pair.
   *     If the index is that of the trail surrogate of a pair, the negated
   *     code point encoded by the pair.
   */
  goog.i18n.uChar.getCodePointAround = function(string, index) {
    var unit = string.charCodeAt(index);

    if (goog.i18n.uChar.isLeadSurrogateCodePoint(unit)) {
      if (index + 1 < string.length) {
        var following = string.charCodeAt(index + 1);
        if (goog.i18n.uChar.isTrailSurrogateCodePoint(following)) {
          // Index is on the first half of a surrogate pair.
          return /** @type {number} */ (
              goog.i18n.uChar.buildSupplementaryCodePoint(unit, following));
        }
      }
      // Unpaired lead surrogate.
      return unit;
    }

    if (goog.i18n.uChar.isTrailSurrogateCodePoint(unit) && index > 0) {
      var preceding = string.charCodeAt(index - 1);
      if (goog.i18n.uChar.isLeadSurrogateCodePoint(preceding)) {
        // Index is on the second half of a surrogate pair.
        return /** @type {number} */ (
            -goog.i18n.uChar.buildSupplementaryCodePoint(preceding, unit));
      }
    }

    return unit;
  };
  /**
   * Reports how many string positions are needed to hold the given Unicode
   * code point.
   * @param {number} codePoint
   * @return {number} 2 if codePoint is a supplementary character, 1 otherwise.
   */
  goog.i18n.uChar.charCount = function(codePoint) {
    if (goog.i18n.uChar.isSupplementaryCodePoint(codePoint)) {
      return 2;
    }
    return 1;
  };
  /**
   * Tests whether a Unicode code point lies in the supplementary range, i.e.
   * between SUPPLEMENTARY_CODE_POINT_MIN_VALUE_ and CODE_POINT_MAX_VALUE_
   * inclusive.
   * @param {number} codePoint
   * @return {boolean} Whether the given code point is a supplementary
   *     character.
   */
  goog.i18n.uChar.isSupplementaryCodePoint = function(codePoint) {
    var lowest = goog.i18n.uChar.SUPPLEMENTARY_CODE_POINT_MIN_VALUE_;
    var highest = goog.i18n.uChar.CODE_POINT_MAX_VALUE_;
    return lowest <= codePoint && codePoint <= highest;
  };
  /**
   * Tests whether a code point lies in the lead surrogate range, i.e. between
   * LEAD_SURROGATE_MIN_VALUE_ and LEAD_SURROGATE_MAX_VALUE_ inclusive.
   * @param {number} codePoint
   * @return {boolean} Whether the given code point is a leading surrogate
   *     character.
   */
  goog.i18n.uChar.isLeadSurrogateCodePoint = function(codePoint) {
    var lowest = goog.i18n.uChar.LEAD_SURROGATE_MIN_VALUE_;
    var highest = goog.i18n.uChar.LEAD_SURROGATE_MAX_VALUE_;
    return lowest <= codePoint && codePoint <= highest;
  };
  /**
   * Tests whether a code point lies in the trail surrogate range, i.e. between
   * TRAIL_SURROGATE_MIN_VALUE_ and TRAIL_SURROGATE_MAX_VALUE_ inclusive.
   * @param {number} codePoint
   * @return {boolean} Whether the given code point is a trailing surrogate
   *     character.
   */
  goog.i18n.uChar.isTrailSurrogateCodePoint = function(codePoint) {
    var lowest = goog.i18n.uChar.TRAIL_SURROGATE_MIN_VALUE_;
    var highest = goog.i18n.uChar.TRAIL_SURROGATE_MAX_VALUE_;
    return lowest <= codePoint && codePoint <= highest;
  };
  /**
   * Decodes a UTF-16 surrogate pair into the supplementary code point it
   * represents. Returns null when {@code lead} is not a leading surrogate code
   * point or {@code trail} is not a trailing surrogate code point.
   * @param {number} lead The leading surrogate code point.
   * @param {number} trail The trailing surrogate code point.
   * @return {?number} The supplementary Unicode code point obtained by decoding
   *     the given UTF-16 surrogate pair.
   */
  goog.i18n.uChar.buildSupplementaryCodePoint = function(lead, trail) {
    if (!goog.i18n.uChar.isLeadSurrogateCodePoint(lead) ||
        !goog.i18n.uChar.isTrailSurrogateCodePoint(trail)) {
      return null;
    }

    // The lead surrogate's offset within its range supplies the high bits.
    var highBits = (lead - goog.i18n.uChar.LEAD_SURROGATE_MIN_VALUE_)
        << goog.i18n.uChar.TRAIL_SURROGATE_BIT_COUNT_;

    // The trail surrogate's offset supplies the low bits; the supplementary
    // base is folded in here.
    var lowBitsPlusBase = trail - goog.i18n.uChar.TRAIL_SURROGATE_MIN_VALUE_ +
        goog.i18n.uChar.SUPPLEMENTARY_CODE_POINT_MIN_VALUE_;

    return highBits + lowBitsPlusBase;
  };