Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions packages/cspell-trie-lib/perf/Utf8.perf.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,16 @@ import { Buffer } from 'node:buffer';
import { suite } from 'perf-insight';

import {
decodeUtf8_32,
decodeUtf8_32Rev,
decodeUtf8ByteStream,
decodeUtf8N_BE,
decodeUtf8N_LE,
encodeCodePointsToUtf8Into,
encodeTextToUtf8,
encodeTextToUtf8_32,
encodeTextToUtf8_32Into,
encodeTextToUtf8Into,
encodeUtf8N_BE,
encodeUtf8N_LE,
encodeToUtf8_32,
encodeToUtf8_32Rev,
textToCodePoints,
} from '../src/lib/TrieBlob/Utf8.ts';
import { Utf8Encoder, Utf8Encoder2 } from '../src/lib/TrieBlob/Utf8Encoder.ts';
Expand Down Expand Up @@ -270,8 +270,8 @@ suite('Utf8 encode/decode', async (test) => {
for (let i = iterations; i > 0; --i) {
for (const char of chars) {
const cp = char.codePointAt(0) || 0;
const u8 = encodeUtf8N_BE(cp);
const dcp = decodeUtf8N_BE(u8);
const u8 = encodeToUtf8_32(cp);
const dcp = decodeUtf8_32(u8);
String.fromCodePoint(dcp);
}
}
Expand All @@ -281,8 +281,8 @@ suite('Utf8 encode/decode', async (test) => {
for (let i = iterations; i > 0; --i) {
for (const char of chars) {
const cp = char.codePointAt(0) || 0;
const u8 = encodeUtf8N_LE(cp);
const dcp = decodeUtf8N_LE(u8);
const u8 = encodeToUtf8_32Rev(cp);
const dcp = decodeUtf8_32Rev(u8);
String.fromCodePoint(dcp);
}
}
Expand Down
6 changes: 3 additions & 3 deletions packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { encodeTextToUtf8, encodeUtf8N_BE, type Utf8BE32 } from './Utf8.ts';
import { encodeTextToUtf8, encodeToUtf8_32, type Utf8_32 } from './Utf8.ts';

export type Utf8Seq = Readonly<number[]>;

export type CharIndexMap = Map<string, Utf8BE32>;
export type CharIndexMap = Map<string, Utf8_32>;

export type RO_CharIndexMap = Readonly<CharIndexMap>;

Expand Down Expand Up @@ -103,7 +103,7 @@ export class CharIndexBuilder {
}
const nc = c.normalize('NFC');
this.charIndex.add(nc);
const utf8 = encodeUtf8N_BE(nc.codePointAt(0) || 0);
const utf8 = encodeToUtf8_32(nc.codePointAt(0) || 0);
this.charIndexMap.set(c, utf8);
this.charIndexMap.set(nc, utf8);
this.charIndexMap.set(c.normalize('NFD'), utf8);
Expand Down
27 changes: 14 additions & 13 deletions packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import type { FastTrieBlobBitMaskInfo } from './FastTrieBlobBitMaskInfo.ts';
import { FastTrieBlobInternals, sortNodes } from './FastTrieBlobInternals.ts';
import { resolveMap } from './resolveMap.ts';
import { TrieBlob } from './TrieBlob.ts';
import { encodeToUtf8_32Rev } from './Utf8.ts';

type FastTrieBlobNode = number[];

Expand Down Expand Up @@ -106,24 +107,24 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
let nodeIdx = 0;
let depth = 0;

const insertChar = (char: string) => {
function insertChar(char: string) {
if (!nodes[nodeIdx]) {
refNodes.push(nodeIdx);
}
// console.warn('insertChar %o', { nodeIdx, depth, char });
const pDepth = depth;
const utf8Seq = this.letterToUtf8Seq(char);
for (let i = 0; i < utf8Seq.length; ++i) {
insertCharIndexes(utf8Seq[i], pDepth);

for (let encoded = encodeToUtf8_32Rev(char.codePointAt(0) || 0); encoded; encoded >>>= 8) {
insertCharIndexes(encoded & 0xff, pDepth);
}
};
}

/**
* A single character can result in multiple nodes being created
* because it takes multiple bytes to represent a character.
* @param seq - partial character index.
*/
const insertCharIndexes = (seq: number, pDepth: number) => {
function insertCharIndexes(seq: number, pDepth: number) {
// console.warn('i %o at %o', char, nodeIdx);
if (nodes[nodeIdx] && Object.isFrozen(nodes[nodeIdx])) {
nodeIdx = nodes.push([...nodes[nodeIdx]]) - 1;
Expand All @@ -148,9 +149,9 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
stack[depth] = { nodeIdx, pos, pDepth };
}
nodeIdx = childIdx;
};
}

const markEOW = () => {
function markEOW() {
// console.warn('$');
if (nodeIdx === eow) return;
const node = nodes[nodeIdx];
Expand All @@ -164,9 +165,9 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
node[0] |= NodeMaskEOW;
}
nodeIdx = eow;
};
}

const reference = (refId: number) => {
function reference(refId: number) {
const refNodeIdx = refNodes[refId];
assert(refNodeIdx !== undefined);
// console.warn('r %o', { refId, nodeIdx, refNodeIdx, depth });
Expand All @@ -178,17 +179,17 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
const pos = s.pos;
const node = nodes[nodeIdx];
node[pos] = (refNodeIdx << NodeChildRefShift) | (node[pos] & LetterMask);
};
}

const backStep = (num: number) => {
function backStep(num: number) {
if (!num) return;
// console.warn('<< %o', num);
assert(num <= depth && num > 0);
for (let n = num; n > 0; --n) {
depth = stack[depth].pDepth;
}
nodeIdx = stack[depth + 1].nodeIdx;
};
}

const c: BuilderCursor = {
insertChar,
Expand Down
9 changes: 3 additions & 6 deletions packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { endianness } from '../utils/endian.ts';
import { mergeOptionalWithDefaults } from '../utils/mergeOptionalWithDefaults.ts';
import { decodeTrieBlobToBTrie, encodeTrieBlobToBTrie } from './TrieBlobEncoder.ts';
import { TrieBlobInternals, TrieBlobIRoot } from './TrieBlobIRoot.ts';
import { encodeTextToUtf8_32, Utf8Accumulator } from './Utf8.ts';
import { encodeTextToUtf8_32Rev, Utf8Accumulator } from './Utf8.ts';

const NodeHeaderNumChildrenBits = 8 as const;
const NodeHeaderNumChildrenShift = 0 as const;
Expand Down Expand Up @@ -143,15 +143,12 @@ export class TrieBlob implements TrieData {
const _nodes8 = this.#nodes8;

for (; p.offset < p.text.length; ) {
const code = encodeTextToUtf8_32(p);

const nodes = _nodes;
const nodes8 = _nodes8;
let node = nodes[nodeIdx];
let s = (p.bytes - 1) * 8;

for (let mask = 0xff << s; mask; mask >>>= 8, s -= 8) {
const charVal = (code & mask) >>> s;
for (let code = encodeTextToUtf8_32Rev(p); code; code >>>= 8) {
const charVal = code & 0xff;
const count = node & 0xff; // TrieBlob.NodeMaskNumChildren
const idx4 = nodeIdx << 2;
// Binary search for the character in the child nodes.
Expand Down
32 changes: 16 additions & 16 deletions packages/cspell-trie-lib/src/lib/TrieBlob/Utf8.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ import assert from 'node:assert';
import { describe, expect, test } from 'vitest';

import {
decodeUtf8_32,
decodeUtf8_32_StreamToString,
decodeUtf8_32Rev,
decodeUtf8ByteStream,
decodeUtf8N_BE,
decodeUtf8N_BE_StreamToString,
decodeUtf8N_LE,
encodeCodePointsToUtf8Into,
encodeTextToUtf8,
encodeTextToUtf8_32Into,
encodeUtf8N_BE,
encodeUtf8N_LE,
encodeToUtf8_32,
encodeToUtf8_32Rev,
hex32,
textToCodePoints,
Utf8Accumulator,
Expand All @@ -35,16 +35,16 @@ describe('Utf8 lib', () => {
encoder.encodeInto(char, buf8);
const expectedUtf8_BE = extractUtf8BE(view);

const utf8BE = encodeUtf8N_BE(codePoint);
const utf8BE = encodeToUtf8_32(codePoint);

expect(utf8BE).toBe(expectedUtf8_BE);
expect(decodeUtf8N_BE(utf8BE)).toBe(codePoint);
expect(decodeUtf8_32(utf8BE)).toBe(codePoint);

const expectedUtf8_LE = view.getUint32(0, true);
const utf8LE = encodeUtf8N_LE(codePoint);
const utf8LE = encodeToUtf8_32Rev(codePoint);

expect(utf8LE).toBe(expectedUtf8_LE);
expect(decodeUtf8N_LE(utf8LE)).toBe(codePoint);
expect(decodeUtf8_32Rev(utf8LE)).toBe(codePoint);
}
});

Expand Down Expand Up @@ -79,14 +79,14 @@ describe('Utf8 lib', () => {
${'é'} | ${[0xc3a9]}
${'🇺🇸'} | ${[0xf09f_87ba, 0xf09f_87b8]}
`('encodeUtf8N_BE $text', ({ text, expected }) => {
const utf = textToCodePoints(text).map((cp) => encodeUtf8N_BE(cp));
const utf = textToCodePoints(text).map((cp) => encodeToUtf8_32(cp));
expect(utf).toEqual(expected);
expect(
String.fromCodePoint(
...utf
.map((v) => v ^ ~1) // force it to be native
.map((v) => v ^ ~1)
.map((c) => decodeUtf8N_BE(c)),
.map((c) => decodeUtf8_32(c)),
),
).toEqual(text);
});
Expand All @@ -104,11 +104,11 @@ describe('Utf8 lib', () => {
});

test('decodeUtf8N_BE invalid', () => {
expect(decodeUtf8N_BE(0xff)).toBe(0xfffd);
expect(decodeUtf8_32(0xff)).toBe(0xfffd);
});

test('decodeUtf8N_LE invalid', () => {
expect(decodeUtf8N_LE(0xff)).toBe(0xfffd);
expect(decodeUtf8_32Rev(0xff)).toBe(0xfffd);
});

test.each`
Expand All @@ -119,14 +119,14 @@ describe('Utf8 lib', () => {
${'ë'} | ${[0xabc3]}
${'🇺🇸'} | ${[0xba87_9ff0, 0xb887_9ff0]}
`('encodeUtf8N_LE $text', ({ text, expected }) => {
const utf = textToCodePoints(text).map((cp) => encodeUtf8N_LE(cp));
const utf = textToCodePoints(text).map((cp) => encodeToUtf8_32Rev(cp));
expect(utf).toEqual(expected);
expect(
String.fromCodePoint(
...utf
.map((v) => v ^ ~1) // force it to be native
.map((v) => v ^ ~1)
.map((c) => decodeUtf8N_LE(c)),
.map((c) => decodeUtf8_32Rev(c)),
),
).toEqual(text);
});
Expand All @@ -147,7 +147,7 @@ describe('Utf8 lib', () => {
const len = encodeTextToUtf8_32Into(text, buffer);

expect(buffer.length).toBe(len);
expect(decodeUtf8N_BE_StreamToString(buffer)).toBe(text);
expect(decodeUtf8_32_StreamToString(buffer)).toBe(text);
});
});

Expand Down
Loading
Loading