/* Copyright 2018 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Categories a character code can be classified into for word-break
 * detection. The numeric values are part of the public contract.
 *
 * The object is frozen so that this shared enum cannot be modified at runtime.
 *
 * @enum {number}
 */
const CharacterType = Object.freeze({
  SPACE: 0,
  ALPHA_LETTER: 1,
  PUNCT: 2,
  HAN_LETTER: 3,
  KATAKANA_LETTER: 4,
  HIRAGANA_LETTER: 5,
  HALFWIDTH_KATAKANA_LETTER: 6,
  THAI_LETTER: 7,
});
/**
 * Tells whether a character code lies below 0x2E80, i.e. before the range
 * in which the CJK-specific checks (Han, Hiragana, Katakana, ...) apply.
 *
 * @param {number} charCode - Character code to test.
 * @returns {boolean} `true` when `charCode` is strictly less than 0x2E80.
 */
function isAlphabeticalScript(charCode) {
  return charCode < 0x2e80;
}
/**
 * Tells whether a 16-bit character code is in the ASCII range 0x00-0x7F.
 *
 * The check masks bits 7-15, so only the low 16 bits of `charCode` are
 * inspected.
 *
 * @param {number} charCode - Character code to test.
 * @returns {boolean} `true` when none of bits 7-15 are set.
 */
function isAscii(charCode) {
  return (charCode & 0xff80) === 0;
}
/**
 * Tells whether a character code is an ASCII letter (a-z or A-Z).
 *
 * @param {number} charCode - Character code to test.
 * @returns {boolean} `true` for 0x61-0x7A and 0x41-0x5A, inclusive.
 */
function isAsciiAlpha(charCode) {
  return (
    (charCode >= /* a = */ 0x61 && charCode <= /* z = */ 0x7a) ||
    (charCode >= /* A = */ 0x41 && charCode <= /* Z = */ 0x5a)
  );
}
/**
 * Tells whether a character code is an ASCII decimal digit (0-9).
 *
 * @param {number} charCode - Character code to test.
 * @returns {boolean} `true` for 0x30-0x39, inclusive.
 */
function isAsciiDigit(charCode) {
  return charCode >= /* 0 = */ 0x30 && charCode <= /* 9 = */ 0x39;
}
/**
 * Tells whether a character code is one of the ASCII whitespace characters
 * recognized here: space, horizontal tab, carriage return or line feed.
 *
 * Other ASCII control characters (e.g. vertical tab 0x0B, form feed 0x0C)
 * are not treated as spaces by this check.
 *
 * @param {number} charCode - Character code to test.
 * @returns {boolean} `true` for 0x20, 0x09, 0x0D and 0x0A only.
 */
function isAsciiSpace(charCode) {
  return (
    charCode === /* SPACE = */ 0x20 ||
    charCode === /* TAB = */ 0x09 ||
    charCode === /* CR = */ 0x0d ||
    charCode === /* LF = */ 0x0a
  );
}
/**
 * Tells whether a character code falls in one of the two ranges treated as
 * Han (CJK ideograph) characters: 0x3400-0x9FFF or 0xF900-0xFAFF.
 *
 * @param {number} charCode - Character code to test.
 * @returns {boolean} `true` when `charCode` is inside either range
 *   (bounds inclusive).
 */
function isHan(charCode) {
  return (
    (charCode >= 0x3400 && charCode <= 0x9fff) ||
    (charCode >= 0xf900 && charCode <= 0xfaff)
  );
}
/**
 * Tells whether a character code falls in the range 0x30A0-0x30FF, which is
 * treated as (full-width) Katakana.
 *
 * @param {number} charCode - Character code to test.
 * @returns {boolean} `true` for 0x30A0-0x30FF, inclusive.
 */
function isKatakana(charCode) {
  return charCode >= 0x30a0 && charCode <= 0x30ff;
}
/**
 * Tells whether a character code falls in the range 0x3040-0x309F, which is
 * treated as Hiragana.
 *
 * @param {number} charCode - Character code to test.
 * @returns {boolean} `true` for 0x3040-0x309F, inclusive.
 */
function isHiragana(charCode) {
  return charCode >= 0x3040 && charCode <= 0x309f;
}
/**
 * Tells whether a character code falls in the range 0xFF60-0xFF9F, which is
 * treated as half-width Katakana.
 *
 * @param {number} charCode - Character code to test.
 * @returns {boolean} `true` for 0xFF60-0xFF9F, inclusive.
 */
function isHalfwidthKatakana(charCode) {
  return charCode >= 0xff60 && charCode <= 0xff9f;
}
/**
 * Tells whether a 16-bit character code falls in the range 0x0E00-0x0E7F,
 * which is treated as Thai.
 *
 * The check masks bits 7-15, so only the low 16 bits of `charCode` are
 * inspected.
 *
 * @param {number} charCode - Character code to test.
 * @returns {boolean} `true` when bits 7-15 of `charCode` equal 0x0E00.
 */
function isThai(charCode) {
  return (charCode & 0xff80) === 0x0e00;
}
/**
 * Classifies a character code into one of the `CharacterType` categories.
 *
 * This function is based on the word-break detection implemented in:
 * https://hg.mozilla.org/mozilla-central/file/tip/intl/lwbrk/WordBreaker.cpp
 *
 * Classification rules, in the order they are applied:
 *  - Codes below 0x2E80:
 *    - ASCII space/tab/CR/LF and NBSP (0xA0) are `SPACE`;
 *    - ASCII letters, digits and the underscore are `ALPHA_LETTER`;
 *    - every other ASCII code is `PUNCT`;
 *    - Thai codes are `THAI_LETTER`;
 *    - anything else is `ALPHA_LETTER`.
 *  - Codes at or above 0x2E80 are `HAN_LETTER`, `KATAKANA_LETTER`,
 *    `HIRAGANA_LETTER` or `HALFWIDTH_KATAKANA_LETTER` when they match the
 *    corresponding range, and `ALPHA_LETTER` otherwise.
 *
 * @param {number} charCode - Character code to classify.
 * @returns {number} One of the `CharacterType` values.
 */
function getCharacterType(charCode) {
  if (isAlphabeticalScript(charCode)) {
    if (isAscii(charCode)) {
      if (isAsciiSpace(charCode)) {
        return CharacterType.SPACE;
      } else if (
        isAsciiAlpha(charCode) ||
        isAsciiDigit(charCode) ||
        charCode === /* UNDERSCORE = */ 0x5f
      ) {
        return CharacterType.ALPHA_LETTER;
      }
      // Any remaining ASCII code (symbols, other control characters).
      return CharacterType.PUNCT;
    } else if (isThai(charCode)) {
      return CharacterType.THAI_LETTER;
    } else if (charCode === /* NBSP = */ 0xa0) {
      return CharacterType.SPACE;
    }
    // Non-ASCII, non-Thai codes below 0x2E80 default to letters.
    return CharacterType.ALPHA_LETTER;
  }

  if (isHan(charCode)) {
    return CharacterType.HAN_LETTER;
  } else if (isKatakana(charCode)) {
    return CharacterType.KATAKANA_LETTER;
  } else if (isHiragana(charCode)) {
    return CharacterType.HIRAGANA_LETTER;
  } else if (isHalfwidthKatakana(charCode)) {
    return CharacterType.HALFWIDTH_KATAKANA_LETTER;
  }
  // Codes at or above 0x2E80 that match none of the ranges above.
  return CharacterType.ALPHA_LETTER;
}
- export { CharacterType, getCharacterType };
|