| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439 |
- /* Copyright 2012 Mozilla Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- import { warn } from "../shared/util.js";
// Bidirectional character types for the code points U+0000 to U+00FF. The
// array is indexed directly by code point; it is laid out with 16 entries per
// row and each row is preceded by the code point of its first entry.
// Source: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
// prettier-ignore
const baseTypes = [
  // 0x00
  "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN",
  "BN", "S", "B", "S", "WS", "B", "BN", "BN",
  // 0x10
  "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN",
  "BN", "BN", "BN", "BN", "B", "B", "B", "S",
  // 0x20
  "WS", "ON", "ON", "ET", "ET", "ET", "ON", "ON",
  "ON", "ON", "ON", "ES", "CS", "ES", "CS", "CS",
  // 0x30
  "EN", "EN", "EN", "EN", "EN", "EN", "EN", "EN",
  "EN", "EN", "CS", "ON", "ON", "ON", "ON", "ON",
  // 0x40
  "ON", "L", "L", "L", "L", "L", "L", "L",
  "L", "L", "L", "L", "L", "L", "L", "L",
  // 0x50
  "L", "L", "L", "L", "L", "L", "L", "L",
  "L", "L", "L", "ON", "ON", "ON", "ON", "ON",
  // 0x60
  "ON", "L", "L", "L", "L", "L", "L", "L",
  "L", "L", "L", "L", "L", "L", "L", "L",
  // 0x70
  "L", "L", "L", "L", "L", "L", "L", "L",
  "L", "L", "L", "ON", "ON", "ON", "ON", "BN",
  // 0x80
  "BN", "BN", "BN", "BN", "BN", "B", "BN", "BN",
  "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN",
  // 0x90
  "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN",
  "BN", "BN", "BN", "BN", "BN", "BN", "BN", "BN",
  // 0xA0
  "CS", "ON", "ET", "ET", "ET", "ET", "ON", "ON",
  "ON", "ON", "L", "ON", "ON", "BN", "ON", "ON",
  // 0xB0
  "ET", "ET", "EN", "EN", "ON", "L", "ON", "ON",
  "ON", "EN", "L", "ON", "ON", "ON", "ON", "ON",
  // 0xC0
  "L", "L", "L", "L", "L", "L", "L", "L",
  "L", "L", "L", "L", "L", "L", "L", "L",
  // 0xD0
  "L", "L", "L", "L", "L", "L", "L", "ON",
  "L", "L", "L", "L", "L", "L", "L", "L",
  // 0xE0
  "L", "L", "L", "L", "L", "L", "L", "L",
  "L", "L", "L", "L", "L", "L", "L", "L",
  // 0xF0
  "L", "L", "L", "L", "L", "L", "L", "ON",
  "L", "L", "L", "L", "L", "L", "L", "L"
];
// Bidirectional character types for the code points U+0600 to U+06FF. The
// array is indexed by the low byte of the code point (`charCode & 0xff`); it
// is laid out with 16 entries per row and each row is preceded by the low
// byte of its first entry.
// Source: https://www.unicode.org/Public/UNIDATA/UnicodeData.txt
// Note that U+061D used to be unassigned and was stored here as an empty
// string. It has been assigned since Unicode 14.0 (ARABIC END OF TEXT MARK,
// bidi class AL), so it now carries its real type and every entry of the
// table is a non-empty string.
// prettier-ignore
const arabicTypes = [
  // 0x00
  "AN", "AN", "AN", "AN", "AN", "AN", "ON", "ON",
  "AL", "ET", "ET", "AL", "CS", "AL", "ON", "ON",
  // 0x10
  "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM",
  "NSM", "NSM", "NSM", "AL", "AL", "AL", "AL", "AL",
  // 0x20
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  // 0x30
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  // 0x40
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  "AL", "AL", "AL", "NSM", "NSM", "NSM", "NSM", "NSM",
  // 0x50
  "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM",
  "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM", "NSM",
  // 0x60
  "AN", "AN", "AN", "AN", "AN", "AN", "AN", "AN",
  "AN", "AN", "ET", "AN", "AN", "AL", "AL", "AL",
  // 0x70
  "NSM", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  // 0x80
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  // 0x90
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  // 0xA0
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  // 0xB0
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  // 0xC0
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",
  // 0xD0
  "AL", "AL", "AL", "AL", "AL", "AL", "NSM", "NSM",
  "NSM", "NSM", "NSM", "NSM", "NSM", "AN", "ON", "NSM",
  // 0xE0
  "NSM", "NSM", "NSM", "NSM", "NSM", "AL", "AL", "NSM",
  "NSM", "ON", "NSM", "NSM", "NSM", "NSM", "AL", "AL",
  // 0xF0
  "EN", "EN", "EN", "EN", "EN", "EN", "EN", "EN",
  "EN", "EN", "AL", "AL", "AL", "AL", "AL", "AL"
];
/**
 * Tells whether the lowest bit of `i` (after conversion to a 32-bit integer)
 * is set.
 *
 * @param {number} i
 * @returns {boolean} `true` when `i` is odd.
 */
function isOdd(i) {
  const lowBit = i & 1;
  return lowBit === 1;
}
/**
 * Tells whether the lowest bit of `i` (after conversion to a 32-bit integer)
 * is clear.
 *
 * @param {number} i
 * @returns {boolean} `true` when `i` is even.
 */
function isEven(i) {
  const lowBit = i & 1;
  return lowBit !== 1;
}
/**
 * Scans `arr` forwards from `start` for the first element that is not
 * strictly equal to `value`.
 *
 * @param {Array} arr - The array to scan.
 * @param {number} start - Index at which the scan begins.
 * @param {*} value - The value that is being skipped over.
 * @returns {number} The index of the first differing element. When every
 *   element from `start` on equals `value` this is `arr.length`, and when
 *   `start` is already at or past the end it is `start` itself.
 */
function findUnequal(arr, start, value) {
  const limit = arr.length;
  let index = start;
  while (index < limit && arr[index] === value) {
    ++index;
  }
  return index;
}
/**
 * Overwrites the elements of `arr` in the half-open index range
 * [`start`, `end`) with `value`, modifying the array in place.
 *
 * @param {Array} arr - The array to modify.
 * @param {number} start - First index to overwrite.
 * @param {number} end - Index just past the last one to overwrite.
 * @param {*} value - The value to store.
 */
function setValues(arr, start, end, value) {
  let index = start;
  while (index < end) {
    arr[index] = value;
    ++index;
  }
}
/**
 * Reverses, in place, the elements of `arr` in the half-open index range
 * [`start`, `end`).
 *
 * @param {Array} arr - The array to modify.
 * @param {number} start - First index of the range.
 * @param {number} end - Index just past the last element of the range.
 */
function reverseValues(arr, start, end) {
  let low = start;
  let high = end - 1;
  // Swap the outermost pair and walk both ends towards the middle.
  while (low < high) {
    [arr[low], arr[high]] = [arr[high], arr[low]];
    ++low;
    --high;
  }
}
/**
 * Bundles a string with the name of its writing direction.
 *
 * @param {string} str - The text.
 * @param {boolean} isLTR - Whether the text runs left-to-right; ignored when
 *   `vertical` is set.
 * @param {boolean} [vertical] - Whether the text is vertical.
 * @returns {{str: string, dir: string}} The text together with `"ttb"` for
 *   vertical text, otherwise `"ltr"` or `"rtl"` according to `isLTR`.
 */
function createBidiText(str, isLTR, vertical = false) {
  if (vertical) {
    return { str, dir: "ttb" };
  }
  return { str, dir: isLTR ? "ltr" : "rtl" };
}
// These are used in bidi(), which is called frequently. We re-use them on
// each call to avoid unnecessary allocations.
const chars = [];
const types = [];

/**
 * Reorders one line of text from logical order to visual order with a
 * simplified version of the Unicode Bidirectional Algorithm (UAX #9).
 *
 * Only the weak-type rules (W1-W7), the neutral-type rules (N1-N2), the
 * implicit-level rules (I1-I2) and the reordering rule L2 are applied;
 * explicit embeddings (X1-X10) and the rules L1, L3 and L4 are skipped.
 * The string is processed one UTF-16 code unit at a time.
 *
 * @param {string} str - The text, in logical order.
 * @param {number} [startLevel] - The paragraph embedding level (even means
 *   left-to-right, odd means right-to-left). With the default of -1 the level
 *   is guessed: the paragraph is treated as left-to-right when fewer than 30%
 *   of its characters are right-to-left and it is longer than 4 characters,
 *   and as right-to-left otherwise.
 * @param {boolean} [vertical] - Whether the text is vertical; vertical text
 *   is returned unchanged.
 * @returns {{str: string, dir: string}} The reordered text and its
 *   direction, as built by createBidiText(). Text without any right-to-left
 *   characters is returned unchanged; otherwise every "<" and ">" is dropped
 *   from the result.
 *   NOTE(review): `dir` is only derived from the guess made for
 *   `startLevel === -1`; with an explicit `startLevel` it is always "ltr".
 *   Confirm that this is what callers expect.
 */
function bidi(str, startLevel = -1, vertical = false) {
  let isLTR = true;
  const strLength = str.length;
  if (strLength === 0 || vertical) {
    return createBidiText(str, isLTR, vertical);
  }

  // Get types and fill arrays
  chars.length = strLength;
  types.length = strLength;
  let numBidi = 0;

  for (let i = 0; i < strLength; ++i) {
    chars[i] = str.charAt(i);

    const charCode = str.charCodeAt(i);
    let charType = "L";
    if (charCode <= 0x00ff) {
      charType = baseTypes[charCode];
    } else if (0x0590 <= charCode && charCode <= 0x05f4) {
      charType = "R";
    } else if (0x0600 <= charCode && charCode <= 0x06ff) {
      charType = arabicTypes[charCode & 0xff];
      if (!charType) {
        warn(`Bidi: invalid Unicode character ${charCode.toString(16)}`);
      }
    } else if (0x0700 <= charCode && charCode <= 0x08ac) {
      charType = "AL";
    }
    if (charType === "R" || charType === "AL" || charType === "AN") {
      numBidi++;
    }
    types[i] = charType;
  }

  // Detect the bidi method
  // - If there are no rtl characters then no bidi needed
  // - If less than 30% chars are rtl then string is primarily ltr,
  //   unless the string is very short.
  // - If more than 30% chars are rtl then string is primarily rtl
  if (numBidi === 0) {
    return createBidiText(str, isLTR);
  }

  if (startLevel === -1) {
    if (numBidi / strLength < 0.3 && strLength > 4) {
      isLTR = true;
      startLevel = 0;
    } else {
      isLTR = false;
      startLevel = 1;
    }
  }

  const levels = new Array(strLength).fill(startLevel);

  /*
   X1-X10: skip most of this, since we are NOT doing the embeddings.
   */
  // Without embeddings the whole string is a single level run, so sor and eor
  // are both the paragraph direction.
  const e = isOdd(startLevel) ? "R" : "L";
  const sor = e;
  const eor = sor;

  /*
   W1. Examine each non-spacing mark (NSM) in the level run, and change the
   type of the NSM to the type of the previous character. If the NSM is at the
   start of the level run, it will get the type of sor.
   */
  let lastType = sor;
  for (let i = 0; i < strLength; ++i) {
    if (types[i] === "NSM") {
      types[i] = lastType;
    } else {
      lastType = types[i];
    }
  }

  /*
   W2. Search backwards from each instance of a European number until the
   first strong type (R, L, AL, or sor) is found.  If an AL is found, change
   the type of the European number to Arabic number.
   */
  lastType = sor;
  for (let i = 0; i < strLength; ++i) {
    const t = types[i];
    if (t === "EN") {
      types[i] = lastType === "AL" ? "AN" : "EN";
    } else if (t === "R" || t === "L" || t === "AL") {
      lastType = t;
    }
  }

  /*
   W3. Change all ALs to R.
   */
  for (let i = 0; i < strLength; ++i) {
    if (types[i] === "AL") {
      types[i] = "R";
    }
  }

  /*
   W4. A single European separator between two European numbers changes to a
   European number. A single common separator between two numbers of the same
   type changes to that type:
   */
  for (let i = 1; i < strLength - 1; ++i) {
    if (types[i] === "ES" && types[i - 1] === "EN" && types[i + 1] === "EN") {
      types[i] = "EN";
    }
    if (
      types[i] === "CS" &&
      (types[i - 1] === "EN" || types[i - 1] === "AN") &&
      types[i + 1] === types[i - 1]
    ) {
      types[i] = types[i - 1];
    }
  }

  /*
   W5. A sequence of European terminators adjacent to European numbers changes
   to all European numbers:
   */
  for (let i = 0; i < strLength; ++i) {
    if (types[i] === "EN") {
      // do before
      for (let j = i - 1; j >= 0; --j) {
        if (types[j] !== "ET") {
          break;
        }
        types[j] = "EN";
      }
      // do after
      for (let j = i + 1; j < strLength; ++j) {
        if (types[j] !== "ET") {
          break;
        }
        types[j] = "EN";
      }
    }
  }

  /*
   W6. Otherwise, separators and terminators change to Other Neutral:
   */
  for (let i = 0; i < strLength; ++i) {
    const t = types[i];
    if (t === "WS" || t === "ES" || t === "ET" || t === "CS") {
      types[i] = "ON";
    }
  }

  /*
   W7. Search backwards from each instance of a European number until the
   first strong type (R, L, or sor) is found. If an L is found,  then change
   the type of the European number to L.
   */
  lastType = sor;
  for (let i = 0; i < strLength; ++i) {
    const t = types[i];
    if (t === "EN") {
      types[i] = lastType === "L" ? "L" : "EN";
    } else if (t === "R" || t === "L") {
      lastType = t;
    }
  }

  /*
   N1. A sequence of neutrals takes the direction of the surrounding strong
   text if the text on both sides has the same direction. European and Arabic
   numbers are treated as though they were R. Start-of-level-run (sor) and
   end-of-level-run (eor) are used at level run boundaries.
   */
  for (let i = 0; i < strLength; ++i) {
    if (types[i] === "ON") {
      // `end` is the index of the first non-neutral character after the run,
      // or strLength when the run extends to the end of the string.
      const end = findUnequal(types, i + 1, "ON");
      let before = sor;
      if (i > 0) {
        before = types[i - 1];
      }

      // The character that follows the run sits at `end` itself; eor is only
      // used when the run reaches the end of the string.
      let after = eor;
      if (end < strLength) {
        after = types[end];
      }
      // Anything that is not L (i.e. R, EN or AN) counts as R here.
      if (before !== "L") {
        before = "R";
      }
      if (after !== "L") {
        after = "R";
      }
      if (before === after) {
        setValues(types, i, end, before);
      }
      i = end - 1; // reset to end (-1 so next iteration is ok)
    }
  }

  /*
   N2. Any remaining neutrals take the embedding direction.
   */
  for (let i = 0; i < strLength; ++i) {
    if (types[i] === "ON") {
      types[i] = e;
    }
  }

  /*
   I1. For all characters with an even (left-to-right) embedding direction,
   those of type R go up one level and those of type AN or EN go up two
   levels.
   I2. For all characters with an odd (right-to-left) embedding direction,
   those of type L, EN or AN go up one level.
   */
  for (let i = 0; i < strLength; ++i) {
    const t = types[i];
    if (isEven(levels[i])) {
      if (t === "R") {
        levels[i] += 1;
      } else if (t === "AN" || t === "EN") {
        levels[i] += 2;
      }
    } else if (t === "L" || t === "AN" || t === "EN") {
      // isOdd
      levels[i] += 1;
    }
  }

  /*
   L1. On each line, reset the embedding level of the following characters to
   the paragraph embedding level:

   segment separators,
   paragraph separators,
   any sequence of whitespace characters preceding a segment separator or
   paragraph separator, and any sequence of white space characters at the end
   of the line.
   */

  // don't bother as text is only single line

  /*
   L2. From the highest level found in the text to the lowest odd level on
   each line, reverse any contiguous sequence of characters that are at that
   level or higher.
   */

  // find highest level & lowest odd level
  let highestLevel = -1;
  let lowestOddLevel = 99;
  for (let i = 0, ii = levels.length; i < ii; ++i) {
    const level = levels[i];
    if (highestLevel < level) {
      highestLevel = level;
    }
    if (lowestOddLevel > level && isOdd(level)) {
      lowestOddLevel = level;
    }
  }

  // now reverse between those limits
  for (let level = highestLevel; level >= lowestOddLevel; --level) {
    // find segments to reverse
    let start = -1;
    for (let i = 0, ii = levels.length; i < ii; ++i) {
      if (levels[i] < level) {
        if (start >= 0) {
          reverseValues(chars, start, i);
          start = -1;
        }
      } else if (start < 0) {
        start = i;
      }
    }
    // A segment that is still open runs up to the end of the string.
    if (start >= 0) {
      reverseValues(chars, start, levels.length);
    }
  }

  /*
   L3. Combining marks applied to a right-to-left base character will at this
   point precede their base character. If the rendering engine expects them to
   follow the base characters in the final display process, then the ordering
   of the marks and the base character must be reversed.
   */

  // don't bother for now

  /*
   L4. A character that possesses the mirrored property as specified by
   Section 4.7, Mirrored, must be depicted by a mirrored glyph if the resolved
   directionality of that character is R.
   */

  // don't mirror as characters are already mirrored in the pdf

  // Finally, return string. Every "<" and ">" is removed from the reordered
  // text first. NOTE(review): the reason for dropping these two characters is
  // not visible in this file; the behaviour is kept as it was.
  for (let i = 0, ii = chars.length; i < ii; ++i) {
    const ch = chars[i];
    if (ch === "<" || ch === ">") {
      chars[i] = "";
    }
  }
  return createBidiText(chars.join(""), isLTR);
}
- export { bidi };
|