graphemebreak.js 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. // Copyright 2006 The Closure Library Authors. All Rights Reserved.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS-IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. /**
  15. * @fileoverview Detect Grapheme Cluster Break in a pair of codepoints. Follows
  16. * Unicode 5.1 UAX#29. Tailoring for Virama × Indic Consonants is used.
  17. *
  18. */
  19. goog.provide('goog.i18n.GraphemeBreak');
  20. goog.require('goog.structs.InversionMap');
  /**
   * Grapheme_Cluster_Break property values used by the break rules below.
   * The entries correspond directly to the Grapheme_Cluster_Break property
   * values listed in http://unicode.org/reports/tr29 table 2. VIRAMA and
   * INDIC_CONSONANT are extra values that exist for the Virama × Base tailoring
   * mentioned in the notes of that report.
   *
   * CR and LF are placed towards the end of the list: each of them occurs only
   * once, so they are good candidates for the two-decimal-digit values.
   * @enum {number}
   * @protected
   */
  goog.i18n.GraphemeBreak.property = {
    // Default value for code points without a more specific property.
    ANY: 0,
    CONTROL: 1,
    EXTEND: 2,
    PREPEND: 3,
    SPACING_MARK: 4,

    // Values added for the Virama × Indic consonant tailoring.
    INDIC_CONSONANT: 5,
    VIRAMA: 6,

    // Hangul jamo (L, V, T) and precomposed syllable (LV, LVT) classes.
    L: 7,
    V: 8,
    T: 9,
    LV: 10,
    LVT: 11,

    // Rare values, deliberately given the two-digit numbers.
    CR: 12,
    LF: 13,
    REGIONAL_INDICATOR: 14
  };
  /**
   * Inversion map holding the Grapheme Cluster Break property value of every
   * code point. It is built lazily: the slot stays null until
   * goog.i18n.GraphemeBreak.getBreakProp_ needs the map for the first time.
   *
   * @type {goog.structs.InversionMap}
   * @private
   */
  goog.i18n.GraphemeBreak.inversions_ = null;
  /**
   * There are two kinds of grapheme clusters: 1) Legacy 2) Extended. This
   * method applies the legacy rules to the property values of two adjacent
   * code points. The rules are checked in priority order, so an earlier rule
   * wins over a later one.
   *
   * Regional indicator pairs (the building blocks of flag sequences) are kept
   * together (UAX #29 rule GB8a). Because only two code points are visible
   * here, a longer run of regional indicators is reported as one cluster.
   *
   * @param {number} prop_a The property enum value of the first character.
   * @param {number} prop_b The property enum value of the second character.
   * @return {boolean} True if a & b do not form a cluster; False otherwise.
   * @private
   */
  goog.i18n.GraphemeBreak.applyLegacyBreakRules_ = function(prop_a, prop_b) {
    var prop = goog.i18n.GraphemeBreak.property;

    // CR × LF: the only pair involving a control that stays together.
    if (prop_a === prop.CR && prop_b === prop.LF) {
      return false;
    }
    // Always break after a control, CR or LF ...
    if (prop_a === prop.CONTROL || prop_a === prop.CR || prop_a === prop.LF) {
      return true;
    }
    // ... and always break before one.
    if (prop_b === prop.CONTROL || prop_b === prop.CR || prop_b === prop.LF) {
      return true;
    }

    // Hangul syllable sequences: L × (L | V | LV | LVT).
    if ((prop_a === prop.L) &&
        (prop_b === prop.L || prop_b === prop.V ||
         prop_b === prop.LV || prop_b === prop.LVT)) {
      return false;
    }
    // (LV | V) × (V | T).
    if ((prop_a === prop.LV || prop_a === prop.V) &&
        (prop_b === prop.V || prop_b === prop.T)) {
      return false;
    }
    // (LVT | T) × T.
    if ((prop_a === prop.LVT || prop_a === prop.T) && (prop_b === prop.T)) {
      return false;
    }

    // Regional_Indicator × Regional_Indicator: do not split a flag pair.
    // The property is defined in the enum and assigned by the inversion map,
    // but previously no rule consumed it.
    if (prop_a === prop.REGIONAL_INDICATOR &&
        prop_b === prop.REGIONAL_INDICATOR) {
      return false;
    }

    // Never break before an extending character or a virama.
    if (prop_b === prop.EXTEND || prop_b === prop.VIRAMA) {
      return false;
    }
    // Tailoring: Virama × Indic consonant.
    if (prop_a === prop.VIRAMA && prop_b === prop.INDIC_CONSONANT) {
      return false;
    }

    // Otherwise, break everywhere.
    return true;
  };
  /**
   * Looks up the break property enum value of a code point. Precomposed Hangul
   * syllables (LV and LVT) are classified arithmetically; every other code
   * point is looked up in the inversion map, which is created on first use and
   * cached in goog.i18n.GraphemeBreak.inversions_.
   * @param {number} acode The code point value of the character.
   * @return {number} Property enum value of codepoint.
   * @private
   */
  goog.i18n.GraphemeBreak.getBreakProp_ = function(acode) {
    // Hangul syllable block U+AC00..U+D7A3. Syllables repeat with period 0x1C
    // and 0xAC00 % 0x1C is 0x10, so that remainder marks the LV syllables;
    // all remaining syllables in the block are LVT.
    if (acode >= 0xAC00 && acode <= 0xD7A3) {
      return (acode % 0x1C == 0x10) ?
          goog.i18n.GraphemeBreak.property.LV :
          goog.i18n.GraphemeBreak.property.LVT;
    }

    // Build the lookup table once. The first array describes the ranges and the
    // second the property value of each range; the trailing `true` is the
    // opt_delta flag of goog.structs.InversionMap (ranges given as deltas).
    if (!goog.i18n.GraphemeBreak.inversions_) {
      goog.i18n.GraphemeBreak.inversions_ = new goog.structs.InversionMap(
          [
            0, 10, 1, 2, 1, 18, 95, 33, 13, 1, 594,
            112, 275, 7, 263, 45, 1, 1, 1, 2, 1, 2,
            1, 1, 56, 5, 11, 11, 48, 21, 16, 1, 101,
            7, 1, 1, 6, 2, 2, 1, 4, 33, 1, 1,
            1, 30, 27, 91, 11, 58, 9, 34, 4, 1, 9,
            1, 3, 1, 5, 43, 3, 136, 31, 1, 17, 37,
            1, 1, 1, 1, 3, 8, 4, 1, 2, 1, 7,
            8, 2, 2, 21, 8, 1, 2, 17, 39, 1, 1,
            1, 2, 6, 6, 1, 9, 5, 4, 2, 2, 12,
            2, 15, 2, 1, 17, 39, 2, 3, 12, 4, 8,
            6, 17, 2, 3, 14, 1, 17, 39, 1, 1, 3,
            8, 4, 1, 20, 2, 29, 1, 2, 17, 39, 1,
            1, 2, 1, 6, 6, 9, 6, 4, 2, 2, 13,
            1, 16, 1, 18, 41, 1, 1, 1, 12, 1, 9,
            1, 41, 3, 17, 37, 4, 3, 5, 7, 8, 3,
            2, 8, 2, 30, 2, 17, 39, 1, 1, 1, 1,
            2, 1, 3, 1, 5, 1, 8, 9, 1, 3, 2,
            30, 2, 17, 38, 3, 1, 2, 5, 7, 1, 9,
            1, 10, 2, 30, 2, 22, 48, 5, 1, 2, 6,
            7, 19, 2, 13, 46, 2, 1, 1, 1, 6, 1,
            12, 8, 50, 46, 2, 1, 1, 1, 9, 11, 6,
            14, 2, 58, 2, 27, 1, 1, 1, 1, 1, 4,
            2, 49, 14, 1, 4, 1, 1, 2, 5, 48, 9,
            1, 57, 33, 12, 4, 1, 6, 1, 2, 2, 2,
            1, 16, 2, 4, 2, 2, 4, 3, 1, 3, 2,
            7, 3, 4, 13, 1, 1, 1, 2, 6, 1, 1,
            14, 1, 98, 96, 72, 88, 349, 3, 931, 15, 2,
            1, 14, 15, 2, 1, 14, 15, 2, 15, 15, 14,
            35, 17, 2, 1, 7, 8, 1, 2, 9, 1, 1,
            9, 1, 45, 3, 155, 1, 87, 31, 3, 4, 2,
            9, 1, 6, 3, 20, 19, 29, 44, 9, 3, 2,
            1, 69, 23, 2, 3, 4, 45, 6, 2, 1, 1,
            1, 8, 1, 1, 1, 2, 8, 6, 13, 128, 4,
            1, 14, 33, 1, 1, 5, 1, 1, 5, 1, 1,
            1, 7, 31, 9, 12, 2, 1, 7, 23, 1, 4,
            2, 2, 2, 2, 2, 11, 3, 2, 36, 2, 1,
            1, 2, 3, 1, 1, 3, 2, 12, 36, 8, 8,
            2, 2, 21, 3, 128, 3, 1, 13, 1, 7, 4,
            1, 4, 2, 1, 203, 64, 523, 1, 2, 2, 24,
            7, 49, 16, 96, 33, 3070, 3, 141, 1, 96, 32,
            554, 6, 105, 2, 30164, 4, 1, 10, 33, 1, 80,
            2, 272, 1, 3, 1, 4, 1, 23, 2, 2, 1,
            24, 30, 4, 4, 3, 8, 1, 1, 13, 2, 16,
            34, 16, 1, 27, 18, 24, 24, 4, 8, 2, 23,
            11, 1, 1, 12, 32, 3, 1, 5, 3, 3, 36,
            1, 2, 4, 2, 1, 3, 1, 69, 35, 6, 2,
            2, 2, 2, 12, 1, 8, 1, 1, 18, 16, 1,
            3, 6, 1, 5, 48, 1, 1, 3, 2, 2, 5,
            2, 1, 1, 32, 9, 1, 2, 2, 5, 1, 1,
            201, 14, 2, 1, 1, 9, 8, 2, 1, 2, 1,
            2, 1, 1, 1, 18, 11184, 27, 49, 1028, 1024, 6942,
            1, 737, 16, 16, 7, 216, 1, 158, 2, 89, 3,
            513, 1, 2051, 15, 40, 7, 1, 1472, 1, 1, 1,
            53, 14, 1, 57, 2, 1, 45, 3, 4, 2, 1,
            1, 2, 1, 66, 3, 36, 5, 1, 6, 2, 75,
            2, 1, 48, 3, 9, 1, 1, 1258, 1, 1, 1,
            2, 6, 1, 1, 22681, 62, 4, 25042, 1, 1, 3,
            3, 1, 5, 8, 8, 2, 7, 30, 4, 148, 3,
            8097, 26, 790017, 255
          ],
          [
            1, 13, 1, 12, 1, 0, 1, 0, 1, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
            0, 2, 0, 1, 0, 2, 0, 2, 0, 2, 0, 2, 1, 0, 2, 0, 2, 0, 2, 0, 1,
            0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 4,
            0, 5, 2, 4, 2, 0, 4, 2, 4, 6, 4, 0, 2, 5, 0, 2, 0, 5, 2, 4, 0,
            5, 2, 0, 2, 4, 2, 4, 6, 0, 2, 5, 0, 2, 0, 5, 0, 2, 4, 0, 5, 2,
            4, 2, 6, 2, 5, 0, 2, 0, 2, 4, 0, 5, 2, 0, 4, 2, 4, 6, 0, 2, 0,
            2, 4, 0, 5, 2, 0, 2, 4, 2, 4, 6, 2, 5, 0, 2, 0, 5, 0, 2, 0, 5,
            2, 4, 2, 4, 6, 0, 2, 0, 4, 0, 5, 0, 2, 4, 2, 6, 2, 5, 0, 2, 0,
            4, 0, 5, 2, 0, 4, 2, 4, 2, 4, 2, 4, 2, 6, 2, 5, 0, 2, 0, 4, 0,
            5, 0, 2, 4, 2, 4, 6, 0, 2, 0, 2, 0, 4, 0, 5, 6, 2, 4, 2, 4, 2,
            4, 0, 5, 0, 2, 0, 4, 2, 6, 0, 2, 0, 5, 0, 2, 0, 4, 2, 0, 2, 0,
            5, 0, 2, 0, 2, 0, 2, 0, 2, 0, 4, 5, 2, 4, 2, 6, 0, 2, 0, 2, 0,
            2, 0, 5, 0, 2, 4, 2, 0, 6, 4, 2, 5, 0, 5, 0, 4, 2, 5, 2, 5, 0,
            5, 0, 5, 2, 5, 2, 0, 4, 2, 0, 2, 5, 0, 2, 0, 7, 8, 9, 0, 2, 0,
            5, 2, 6, 0, 5, 2, 6, 0, 5, 2, 0, 5, 2, 5, 0, 2, 4, 2, 4, 2, 4,
            2, 6, 2, 0, 2, 0, 2, 0, 2, 0, 5, 2, 4, 2, 4, 2, 4, 2, 0, 5, 0,
            5, 0, 4, 0, 4, 0, 5, 2, 4, 0, 5, 0, 5, 4, 2, 4, 2, 6, 0, 2, 0,
            2, 4, 2, 0, 2, 4, 0, 5, 2, 4, 2, 4, 2, 4, 2, 4, 6, 5, 0, 2, 0,
            2, 4, 0, 5, 4, 2, 4, 2, 6, 4, 5, 0, 5, 0, 5, 0, 2, 4, 2, 4, 2,
            4, 2, 6, 0, 5, 4, 2, 4, 2, 0, 5, 0, 2, 0, 2, 4, 2, 0, 2, 0, 4,
            2, 0, 2, 0, 1, 2, 1, 0, 1, 0, 1, 0, 2, 0, 2, 0, 6, 0, 2, 0, 2,
            0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 6, 5, 2, 5, 4, 2, 4, 0,
            5, 0, 5, 0, 5, 0, 5, 0, 4, 0, 5, 4, 6, 0, 2, 0, 5, 0, 2, 0, 5,
            2, 4, 6, 0, 7, 2, 4, 0, 5, 0, 5, 2, 4, 2, 4, 2, 4, 6, 0, 5, 2,
            4, 2, 4, 2, 0, 2, 0, 2, 4, 0, 5, 0, 5, 0, 5, 0, 5, 2, 0, 2, 0,
            2, 0, 2, 0, 2, 0, 5, 4, 2, 4, 0, 4, 6, 0, 5, 0, 5, 0, 5, 0, 4,
            2, 4, 2, 4, 0, 4, 6, 0, 11, 8, 9, 0, 2, 0, 2, 0, 2, 0, 2, 0, 1,
            0, 2, 0, 1, 0, 2, 0, 2, 0, 2, 6, 0, 4, 2, 4, 0, 2, 6, 0, 2, 4,
            0, 4, 2, 4, 6, 2, 0, 1, 0, 2, 0, 2, 4, 2, 6, 0, 2, 4, 0, 4, 2,
            4, 6, 0, 2, 4, 2, 4, 2, 6, 2, 0, 4, 2, 0, 2, 4, 2, 0, 4, 2, 1,
            2, 0, 2, 0, 2, 0, 2, 0, 14, 0, 1, 2
          ],
          true);
    }

    return /** @type {number} */ (
        goog.i18n.GraphemeBreak.inversions_.at(acode));
  };
  /**
   * Tells whether there is a grapheme cluster boundary between two adjacent
   * code points. There are two kinds of grapheme clusters: 1) Legacy
   * 2) Extended; a boolean flag switches between them.
   *
   * The extended rules only add two "do not break" cases on top of the legacy
   * rules: after a Prepend character and before a SpacingMark. Those cases
   * never override the mandatory breaks around Control, CR and LF, which take
   * precedence in UAX #29 (GB4/GB5 come before GB9a/GB9b).
   *
   * @param {number} a The code point value of the first character.
   * @param {number} b The code point value of the second character.
   * @param {boolean=} opt_extended If true, indicates extended grapheme cluster;
   *     If false, indicates legacy cluster.
   * @return {boolean} True if a & b do not form a cluster; False otherwise.
   */
  goog.i18n.GraphemeBreak.hasGraphemeBreak = function(a, b, opt_extended) {
    var prop = goog.i18n.GraphemeBreak.property;
    var prop_a = goog.i18n.GraphemeBreak.getBreakProp_(a);
    var prop_b = goog.i18n.GraphemeBreak.getBreakProp_(b);

    // No legacy break means no extended break either.
    if (!goog.i18n.GraphemeBreak.applyLegacyBreakRules_(prop_a, prop_b)) {
      return false;
    }
    // Legacy mode: the legacy verdict (a break) is final.
    if (!opt_extended) {
      return true;
    }

    // Breaks next to Control, CR or LF are mandatory and must not be undone by
    // the extended exceptions below (e.g. LF followed by a spacing mark).
    // CR × LF never reaches this point: the legacy rules already kept it
    // together.
    if (prop_a === prop.CONTROL || prop_a === prop.CR || prop_a === prop.LF ||
        prop_b === prop.CONTROL || prop_b === prop.CR || prop_b === prop.LF) {
      return true;
    }

    // Extended clusters: Prepend × and × SpacingMark.
    return !(prop_a === prop.PREPEND || prop_b === prop.SPACING_MARK);
  };