pdf_find_utils.js 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. /* Copyright 2018 Mozilla Foundation
  2. *
  3. * Licensed under the Apache License, Version 2.0 (the "License");
  4. * you may not use this file except in compliance with the License.
  5. * You may obtain a copy of the License at
  6. *
  7. * http://www.apache.org/licenses/LICENSE-2.0
  8. *
  9. * Unless required by applicable law or agreed to in writing, software
  10. * distributed under the License is distributed on an "AS IS" BASIS,
  11. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. * See the License for the specific language governing permissions and
  13. * limitations under the License.
  14. */
  15. const CharacterType = {
  16. SPACE: 0,
  17. ALPHA_LETTER: 1,
  18. PUNCT: 2,
  19. HAN_LETTER: 3,
  20. KATAKANA_LETTER: 4,
  21. HIRAGANA_LETTER: 5,
  22. HALFWIDTH_KATAKANA_LETTER: 6,
  23. THAI_LETTER: 7,
  24. };
  25. function isAlphabeticalScript(charCode) {
  26. return charCode < 0x2e80;
  27. }
  28. function isAscii(charCode) {
  29. return (charCode & 0xff80) === 0;
  30. }
  31. function isAsciiAlpha(charCode) {
  32. return (
  33. (charCode >= /* a = */ 0x61 && charCode <= /* z = */ 0x7a) ||
  34. (charCode >= /* A = */ 0x41 && charCode <= /* Z = */ 0x5a)
  35. );
  36. }
  37. function isAsciiDigit(charCode) {
  38. return charCode >= /* 0 = */ 0x30 && charCode <= /* 9 = */ 0x39;
  39. }
  40. function isAsciiSpace(charCode) {
  41. return (
  42. charCode === /* SPACE = */ 0x20 ||
  43. charCode === /* TAB = */ 0x09 ||
  44. charCode === /* CR = */ 0x0d ||
  45. charCode === /* LF = */ 0x0a
  46. );
  47. }
  48. function isHan(charCode) {
  49. return (
  50. (charCode >= 0x3400 && charCode <= 0x9fff) ||
  51. (charCode >= 0xf900 && charCode <= 0xfaff)
  52. );
  53. }
  54. function isKatakana(charCode) {
  55. return charCode >= 0x30a0 && charCode <= 0x30ff;
  56. }
  57. function isHiragana(charCode) {
  58. return charCode >= 0x3040 && charCode <= 0x309f;
  59. }
  60. function isHalfwidthKatakana(charCode) {
  61. return charCode >= 0xff60 && charCode <= 0xff9f;
  62. }
  63. function isThai(charCode) {
  64. return (charCode & 0xff80) === 0x0e00;
  65. }
  66. /**
  67. * This function is based on the word-break detection implemented in:
  68. * https://hg.mozilla.org/mozilla-central/file/tip/intl/lwbrk/WordBreaker.cpp
  69. */
  70. function getCharacterType(charCode) {
  71. if (isAlphabeticalScript(charCode)) {
  72. if (isAscii(charCode)) {
  73. if (isAsciiSpace(charCode)) {
  74. return CharacterType.SPACE;
  75. } else if (
  76. isAsciiAlpha(charCode) ||
  77. isAsciiDigit(charCode) ||
  78. charCode === /* UNDERSCORE = */ 0x5f
  79. ) {
  80. return CharacterType.ALPHA_LETTER;
  81. }
  82. return CharacterType.PUNCT;
  83. } else if (isThai(charCode)) {
  84. return CharacterType.THAI_LETTER;
  85. } else if (charCode === /* NBSP = */ 0xa0) {
  86. return CharacterType.SPACE;
  87. }
  88. return CharacterType.ALPHA_LETTER;
  89. }
  90. if (isHan(charCode)) {
  91. return CharacterType.HAN_LETTER;
  92. } else if (isKatakana(charCode)) {
  93. return CharacterType.KATAKANA_LETTER;
  94. } else if (isHiragana(charCode)) {
  95. return CharacterType.HIRAGANA_LETTER;
  96. } else if (isHalfwidthKatakana(charCode)) {
  97. return CharacterType.HALFWIDTH_KATAKANA_LETTER;
  98. }
  99. return CharacterType.ALPHA_LETTER;
  100. }
  101. export { CharacterType, getCharacterType };