Skip to content

Commit 800413b

Browse files
committed
Added lexer package with a general purpose string tokenizer
1 parent 4f0f57c commit 800413b

File tree

6 files changed

+353
-0
lines changed

6 files changed

+353
-0
lines changed

deno.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
"@stdext/crypto": "jsr:@stdext/crypto",
99
"@stdext/encoding": "jsr:@stdext/encoding",
1010
"@stdext/http": "jsr:@stdext/http",
11+
"@stdext/lexer": "jsr:@stdext/lexer",
1112
"@stdext/types": "jsr:@stdext/types"
1213
},
1314
"tasks": {
@@ -26,6 +27,7 @@
2627
"./crypto",
2728
"./encoding",
2829
"./http",
30+
"./lexer",
2931
"./types"
3032
],
3133
"exclude": [

lexer/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# @stdext/lexer
2+
3+
The lexer package contains general-purpose lexers/tokenizers.

lexer/deno.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"version": "0.0.1",
3+
"name": "@stdext/lexer",
4+
"exports": {
5+
".": "./mod.ts",
6+
"./string_tokenizer": "./string_tokenizer.ts"
7+
}
8+
}

lexer/mod.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
export * from "./string_tokenizer.ts";

lexer/string_tokenizer.test.ts

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
import { StringTokenizer } from "./string_tokenizer.ts";
2+
import { assertEquals } from "@std/assert";
3+
4+
Deno.test("StringTokenizer > can tokenize empty stiring and matcher", () => {
5+
const data = "";
6+
const t = new StringTokenizer({
7+
data: data,
8+
matchers: [],
9+
});
10+
11+
const tokens = t.tokenize();
12+
13+
assertEquals(tokens, []);
14+
});
15+
16+
Deno.test("StringTokenizer > can tokenize", () => {
17+
const data = "test";
18+
const t = new StringTokenizer({
19+
data: data,
20+
matchers: [
21+
{
22+
key: (v, i) => v === "t" && i === 3,
23+
handler: (v, i) => ({
24+
index: i,
25+
type: "function",
26+
value: v,
27+
}),
28+
},
29+
{
30+
key: "t",
31+
handler: (v, i) => ({
32+
index: i,
33+
type: "string",
34+
value: v,
35+
}),
36+
},
37+
{
38+
key: /[es]/,
39+
handler: (v, i) => ({
40+
index: i,
41+
type: "regex",
42+
value: v,
43+
}),
44+
},
45+
],
46+
});
47+
48+
const tokens = t.tokenize();
49+
50+
assertEquals(tokens, [
51+
{
52+
index: 0,
53+
type: "string",
54+
value: "t",
55+
},
56+
{
57+
index: 1,
58+
type: "regex",
59+
value: "e",
60+
},
61+
{
62+
index: 2,
63+
type: "regex",
64+
value: "s",
65+
},
66+
{
67+
index: 3,
68+
type: "function",
69+
value: "t",
70+
},
71+
]);
72+
});
73+
74+
Deno.test("StringTokenizer > can tokenize with default handler", () => {
75+
const data = "test";
76+
const t = new StringTokenizer({
77+
data: data,
78+
matchers: [],
79+
defaultHandler: (v, i) => ({
80+
index: i,
81+
type: "default",
82+
value: v,
83+
}),
84+
});
85+
86+
const tokens = t.tokenize();
87+
88+
assertEquals(tokens, [
89+
{
90+
index: 0,
91+
type: "default",
92+
value: "t",
93+
},
94+
{
95+
index: 1,
96+
type: "default",
97+
value: "e",
98+
},
99+
{
100+
index: 2,
101+
type: "default",
102+
value: "s",
103+
},
104+
{
105+
index: 3,
106+
type: "default",
107+
value: "t",
108+
},
109+
]);
110+
});
111+
112+
Deno.test("StringTokenizer > can tokenize with custom index increase", () => {
113+
const data = "test";
114+
const t = new StringTokenizer({
115+
data: data,
116+
matchers: [],
117+
defaultHandler: (v, i) => [{
118+
index: i,
119+
type: "default",
120+
value: v,
121+
}, 2],
122+
});
123+
124+
const tokens = t.tokenize();
125+
126+
assertEquals(tokens, [
127+
{
128+
index: 0,
129+
type: "default",
130+
value: "t",
131+
},
132+
{
133+
index: 2,
134+
type: "default",
135+
value: "s",
136+
},
137+
]);
138+
});

lexer/string_tokenizer.ts

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
/**
2+
* The token object
3+
*/
4+
export type StringTokenizerToken<Type = string, Value = string> = {
5+
/**
6+
* A token type
7+
*/
8+
type: Type;
9+
/**
10+
* The token value
11+
*/
12+
value: Value;
13+
/**
14+
* The index where the token starts
15+
*/
16+
index: number;
17+
};
18+
19+
/**
20+
* Key for the matcher, takes a string, regex or a function
21+
*/
22+
export type StringTokenizerKeyMatcher =
23+
| string
24+
| RegExp
25+
| ((value: string, index: number) => boolean);
26+
27+
/**
28+
* The return type can be either a token or an touple with the token as the
29+
* first value andthe amount to increment the index with as the second argument
30+
*
31+
* @example token
32+
* ```ts
33+
* { type: "someType", value: "a", index: 5 }
34+
* ```
35+
*
36+
* @example tuple with token and index, index will be incremented by 3
37+
* ```ts
38+
* [{ type: "someType", value: "abc", index: 5 }, 3]
39+
* ```
40+
*/
41+
export type StringTokenizerHandlerReturnType<Type = string, Value = string> =
42+
| StringTokenizerToken<Type, Value>
43+
| [StringTokenizerToken<Type, Value>]
44+
| [StringTokenizerToken<Type, Value>, number | undefined];
45+
46+
/**
47+
* Handler for the matched token
48+
*
49+
* @param value the current value at the given index
50+
* @param index the current index
51+
* @param data a clone of the full data string
52+
*/
53+
export type StringTokenizerHandler<Type = string, Value = string> = (
54+
value: string,
55+
index: number,
56+
data: string,
57+
) => StringTokenizerHandlerReturnType<Type, Value>;
58+
59+
/**
60+
* Matcher object that contains a key to match the value against and a handler
61+
*/
62+
export type StringTokenizerMatcher<Type = string, Value = string> = {
63+
key: StringTokenizerKeyMatcher;
64+
handler: StringTokenizerHandler<Type, Value>;
65+
};
66+
export type StringTokenizerOptions<Type = string, Value = string> = {
67+
/**
68+
* The data to tokenize
69+
*/
70+
data: string;
71+
/**
72+
* The matchers, will be checked in order
73+
*/
74+
matchers: StringTokenizerMatcher<Type, Value>[];
75+
/**
76+
* A default handler in case no matcher matches
77+
*/
78+
defaultHandler?: StringTokenizerHandler<Type, Value>;
79+
};
80+
81+
/**
82+
* General purpose string tokenizer
83+
*
84+
* @example
85+
* ```ts
86+
* const t = new StringTokenizer({
87+
* data: "testa",
88+
* matchers: [
89+
* {
90+
* key: (v, i) => v === "t" && i === 3,
91+
* handler: (v, i) => ({
92+
* index: i,
93+
* type: "function",
94+
* value: v,
95+
* }),
96+
* },
97+
* {
98+
* key: "t",
99+
* handler: (v, i) => ([
100+
* {
101+
* index: i,
102+
* type: "string",
103+
* value: v,
104+
* },
105+
* 2
106+
* ]),
107+
* },
108+
* {
109+
* key: /[es]/,
110+
* handler: (v, i) => ({
111+
* index: i,
112+
* type: "regex",
113+
* value: v,
114+
* }),
115+
* },
116+
* ],
117+
* defualtHandler:(v, i) => ({
118+
* index: i,
119+
* type: "default",
120+
* value: v,
121+
* }),
122+
* });
123+
*
124+
* const tokens = t.tokenize();
125+
* ```
126+
*/
127+
export class StringTokenizer<Type = string, Value = string> {
128+
readonly #data: string;
129+
readonly #matchers: StringTokenizerMatcher<Type, Value>[];
130+
readonly #defaultHandler?: StringTokenizerHandler<Type, Value>;
131+
132+
#index = 0;
133+
134+
get #currentChar(): string {
135+
return this.#data[this.#index];
136+
}
137+
138+
constructor(options: StringTokenizerOptions<Type, Value>) {
139+
this.#data = options.data;
140+
this.#matchers = options.matchers;
141+
this.#defaultHandler = options.defaultHandler;
142+
}
143+
144+
#incrementIndex(value = 1): void {
145+
this.#index += value;
146+
}
147+
148+
tokenize(): StringTokenizerToken<Type, Value>[] {
149+
this.#index = 0;
150+
const tokens: StringTokenizerToken<Type, Value>[] = [];
151+
152+
while (this.#index < this.#data.length) {
153+
const token = this.#match();
154+
let increment = 1;
155+
156+
if (Array.isArray(token)) {
157+
tokens.push(token[0]);
158+
if (token[1] !== undefined) {
159+
increment = token[1];
160+
}
161+
} else {
162+
tokens.push(token);
163+
}
164+
165+
this.#incrementIndex(increment);
166+
}
167+
168+
return tokens;
169+
}
170+
171+
#match(): StringTokenizerHandlerReturnType<Type, Value> {
172+
for (const matcher of this.#matchers) {
173+
if (
174+
typeof matcher.key === "string" &&
175+
matcher.key === this.#currentChar ||
176+
matcher.key instanceof RegExp &&
177+
matcher.key.test(this.#currentChar) ||
178+
typeof matcher.key === "function" &&
179+
matcher.key(this.#currentChar, this.#index)
180+
) {
181+
return matcher.handler(
182+
this.#currentChar,
183+
this.#index,
184+
this.#data,
185+
) as StringTokenizerHandlerReturnType<Type, Value>;
186+
}
187+
}
188+
189+
if (this.#defaultHandler) {
190+
return this.#defaultHandler(
191+
this.#currentChar,
192+
this.#index,
193+
this.#data,
194+
) as StringTokenizerHandlerReturnType<Type, Value>;
195+
}
196+
197+
throw new Error(
198+
`No matchers matched the value '${this.#currentChar}', and a default handler was not set.`,
199+
);
200+
}
201+
}

0 commit comments

Comments
 (0)