Commit 121625d

#23 improve performance (#24)
* implement DFA construction
* feat: add DFA lexer and uncompressed parser table
* improve performance
* update README
* chore: remove unnecessary code

1 parent 8dfd5eb · commit 121625d

40 files changed: +45,642 −596 lines

.github/workflows/rust.yml

Lines changed: 6 additions & 0 deletions
@@ -18,6 +18,12 @@ jobs:
       run: cargo build --verbose
     - name: Run tests
       run: cargo test --verbose
+    - name: Run tests with features
+      run: cargo test --verbose --features regex-match
+    - name: Run benchmarks
+      run: cargo bench
+    - name: Run benchmarks with features
+      run: cargo bench --features regex-match
     - name: clippy
       uses: actions-rs/clippy-check@v1
       with:
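
The two `cargo bench` steps assume benchmark targets exist in the workspace. For orientation only — the file name, the Criterion dev-dependency, and the `postgresql_cst_parser::parse` entry point are assumptions here, not something this diff shows — a target that `cargo bench` would pick up could look like:

```rust
// benches/parse.rs -- hypothetical sketch; requires `criterion` as a
// dev-dependency and `[[bench]] name = "parse", harness = false` in Cargo.toml.
use criterion::{Criterion, black_box, criterion_group, criterion_main};

fn bench_parse(c: &mut Criterion) {
    let sql = "SELECT id, name FROM users WHERE id = 1;";
    // Measure end-to-end parsing of a small statement.
    c.bench_function("parse", |b| {
        b.iter(|| postgresql_cst_parser::parse(black_box(sql)))
    });
}

criterion_group!(benches, bench_parse);
criterion_main!(benches);
```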

Cargo.toml

Lines changed: 2 additions & 1 deletion
@@ -2,6 +2,7 @@
 resolver = "2"
 
 members = [
+    "crates/automata",
     "crates/lexer-generator",
     "crates/parser-generator",
     "crates/postgresql-cst-parser",
@@ -11,7 +12,7 @@ members = [
 default-members = ["crates/postgresql-cst-parser"]
 
 [workspace.package]
-exclude = ["crates/lexer-generator", "crates/parser-generator", "crates/postgresql-cst-parser-wasm"]
+exclude = ["crates/automata", "crates/lexer-generator", "crates/parser-generator", "crates/postgresql-cst-parser-wasm"]
 
 [profile.release.package.postgresql-cst-parser-wasm]
 opt-level = "s"

README.ja.md

Lines changed: 4 additions & 1 deletion
@@ -13,6 +13,7 @@
 - **PostgreSQL 17 support**: Supports the latest PostgreSQL 17 syntax.
 - **Structured CST output**: The generated CST strictly follows the structure defined in PostgreSQL's [gram.y](https://github.com/postgres/postgres/blob/REL_17_0/src/backend/parser/gram.y) file.
 - **Uses `cstree`**: The `cstree` crate is used to build the syntax tree.
+- **Works with wasm-bindgen**: Written in pure Rust, so it can be used together with wasm-bindgen.
 - **PL/pgSQL**: Currently not supported.
 
 ## Development Motivation
@@ -141,7 +142,9 @@ Root@0..31
     Semicolon@30..31 ";"
 ```
 
-If you would like to try this parser for yourself, you can do so directly [here](https://tanzaku.github.io/postgresql-cst-parser/).
+## Online Demo
+
+You can try the parser directly [here](https://tanzaku.github.io/postgresql-cst-parser/). Enter an SQL query and see the generated syntax tree in real time.
 
 ## Implementation

README.md

Lines changed: 4 additions & 1 deletion
@@ -13,6 +13,7 @@
 - **PostgreSQL 17 Support**: Supports the latest PostgreSQL 17 syntax.
 - **Structured CST Output**: The generated CST strictly follows the structure defined in PostgreSQL's [gram.y](https://github.com/postgres/postgres/blob/REL_17_0/src/backend/parser/gram.y) file.
 - **Utilizing `cstree`**: Uses the `cstree` crate for building syntax trees.
+- **Compatible with wasm-bindgen**: Being written in pure Rust, it can be used with wasm-bindgen for WebAssembly integration.
 - **PL/pgSQL**: Currently not supported.
 
 ## Development Motivation
@@ -141,7 +142,9 @@ Root@0..31
     Semicolon@30..31 ";"
 ```
 
-If you'd like to try this parser directly, you can experience it online [here](https://tanzaku.github.io/postgresql-cst-parser/).
+## Online Demo
+
+You can try the parser directly [here](https://tanzaku.github.io/postgresql-cst-parser/). Enter your SQL query and see the generated syntax tree in real time.
 
 ## Implementation

crates/automata/Cargo.toml

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+[package]
+name = "automata"
+version = "0.1.0"
+edition = "2024"
+exclude.workspace = true
+
+[dependencies]
+typed-arena = "2.0.2"
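
A note on the single dependency: `typed-arena` is what lets the NFA in this crate (see `dfa.rs` below) hand out plain `&'a NFAState<'a>` references that may form cycles. A minimal sketch of that pattern, with a hypothetical `Node` type rather than the crate's actual `nfa.rs`:

```rust
use std::cell::Cell;
use typed_arena::Arena;

// Arena allocation gives every node the same lifetime, so nodes can point
// at each other -- even in cycles -- through shared references plus
// interior mutability, with the arena owning all of them.
struct Node<'a> {
    next: Cell<Option<&'a Node<'a>>>,
}

fn main() {
    let arena = Arena::new();
    let a = arena.alloc(Node { next: Cell::new(None) });
    let b = arena.alloc(Node { next: Cell::new(None) });
    a.next.set(Some(b));
    b.next.set(Some(a)); // a cycle: fine, both nodes live as long as the arena
}
```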

crates/automata/src/dfa.rs

Lines changed: 211 additions & 0 deletions
@@ -0,0 +1,211 @@
+use std::collections::{BTreeMap, BTreeSet};
+
+use crate::nfa::{NFAState, Transition, collect_epsilon_closure};
+
+pub const INVALID_STATE: usize = !0;
+
+/// Represents a state in a Deterministic Finite Automaton specialized for lexical analysis.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+pub struct DFAState {
+    /// Transition table mapping byte values (0-255) to destination state indices.
+    /// Invalid transitions are marked as !0 (usize::MAX).
+    pub transitions: [usize; 256],
+
+    /// ID of the lexical rule accepted in this state.
+    /// Lower values indicate higher-priority rules (those appearing earlier in the flex definition).
+    /// When multiple rules can match the same input, the rule with the lowest ID is selected.
+    pub accept_lexer_rule_id: Option<u32>,
+}
+
+pub struct DFA {
+    pub states: Vec<DFAState>,
+}
+
+impl DFA {
+    pub fn match_bytes(&self, bs: &[u8]) -> Option<u32> {
+        let mut state = 0;
+
+        // The initial state may itself be accepting, as with patterns like '.*'
+        let mut accepted = self.states[state].accept_lexer_rule_id;
+
+        for &b in bs {
+            let next_state = self.states[state].transitions[b as usize];
+            if next_state == INVALID_STATE {
+                break;
+            }
+
+            if self.states[next_state].accept_lexer_rule_id.is_some() {
+                accepted = self.states[next_state].accept_lexer_rule_id;
+            }
+            state = next_state;
+        }
+
+        // After processing all input, check for an EOF transition.
+        // EOF is represented as byte 0 for simplicity, as in null-terminated strings.
+        let next_state = self.states[state].transitions[0];
+        if next_state != INVALID_STATE && self.states[next_state].accept_lexer_rule_id.is_some() {
+            accepted = self.states[next_state].accept_lexer_rule_id;
+        }
+
+        accepted
+    }
+
+    pub fn match_string(&self, s: &str) -> Option<u32> {
+        self.match_bytes(s.as_bytes())
+    }
+}
+
+impl<'a> From<&'a NFAState<'a>> for DFA {
+    /// Subset construction algorithm
+    fn from(start_state: &'a NFAState<'a>) -> Self {
+        construct_dfa_with_state_mapping(start_state, None)
+    }
+}
+
+/// Selects the highest-priority lexical rule (lowest ID) among the NFA states in `set`.
+/// This implements Flex-style rule selection semantics, where earlier rules in the
+/// definition file take priority when multiple rules match the same input.
+fn select_highest_priority_rule(set: &BTreeSet<&NFAState>) -> Option<u32> {
+    set.iter()
+        .fold(None, |acc, s| match *s.accept_lexer_rule_id.borrow() {
+            Some(v) if v < acc.unwrap_or(u32::MAX) => Some(v),
+            _ => acc,
+        })
+}
+
+/// Subset construction algorithm
+pub fn construct_dfa_with_state_mapping<'a>(
+    start_state: &'a NFAState<'a>,
+    mut dfa_to_nfa: Option<&mut Vec<Vec<usize>>>,
+) -> DFA {
+    // Start from the epsilon closure of the NFA start state.
+    let mut first_set = BTreeSet::new();
+    collect_epsilon_closure(&mut first_set, start_state);
+
+    if let Some(dfa_to_nfa) = dfa_to_nfa.as_mut() {
+        dfa_to_nfa.push(first_set.iter().map(|s| s.state_id).collect());
+    }
+
+    let mut nfa_map = BTreeMap::new();
+    nfa_map.insert(first_set.clone(), 0);
+
+    let mut dfa_states = vec![DFAState {
+        transitions: [INVALID_STATE; 256],
+        accept_lexer_rule_id: select_highest_priority_rule(&first_set),
+    }];
+
+    let mut nfa_states_vec = vec![first_set];
+
+    while let Some(nfa_states) = nfa_states_vec.pop() {
+        let src_state_index = *nfa_map.get(&nfa_states).unwrap();
+
+        for b in 0..=255 {
+            // Compute the epsilon closure of every state reachable on byte `b`.
+            let mut next_set = BTreeSet::new();
+
+            let t = Transition::Char(b);
+
+            for nfa_state in &nfa_states {
+                if let Some(new_states) = nfa_state.transitions.borrow().get(&t) {
+                    for &new_state in new_states {
+                        collect_epsilon_closure(&mut next_set, new_state);
+                    }
+                }
+            }
+
+            if next_set.is_empty() {
+                continue;
+            }
+
+            // Reuse the DFA state for this NFA-state set, or create a new one.
+            let dst_state_index = if let Some(index) = nfa_map.get(&next_set) {
+                *index
+            } else {
+                let index = nfa_map.len();
+
+                let accept_lexer_rule_id = select_highest_priority_rule(&next_set);
+
+                if let Some(dfa_to_nfa) = dfa_to_nfa.as_mut() {
+                    dfa_to_nfa.push(next_set.iter().map(|s| s.state_id).collect());
+                }
+
+                nfa_map.insert(next_set.clone(), index);
+                nfa_states_vec.push(next_set);
+
+                dfa_states.push(DFAState {
+                    transitions: [INVALID_STATE; 256],
+                    accept_lexer_rule_id,
+                });
+
+                index
+            };
+
+            dfa_states[src_state_index].transitions[b as usize] = dst_state_index;
+        }
+    }
+
+    DFA { states: dfa_states }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::nfa::NFA;
+
+    use super::*;
+
+    #[test]
+    fn test_simple1() {
+        let nfa = NFA::new();
+
+        // a(b|c)*d
+        let start = nfa.new_state();
+        let s_a1 = nfa.new_state();
+        let s_a2 = nfa.new_state();
+        let s_bc1 = nfa.new_state();
+        let s_bc2 = nfa.new_state();
+        let s_b1 = nfa.new_state();
+        let s_b2 = nfa.new_state();
+        let s_c1 = nfa.new_state();
+        let s_c2 = nfa.new_state();
+        let s_kleene1 = nfa.new_state();
+        let s_kleene2 = nfa.new_state();
+        let s_d1 = nfa.new_state();
+        let s_d2 = nfa.new_state();
+
+        // Concatenation
+        s_a1.add_transition(s_a2, Transition::Char(b'a'));
+
+        // Alternation
+        s_bc1.add_transition(s_b1, Transition::Epsilon);
+        s_bc1.add_transition(s_c1, Transition::Epsilon);
+        s_b1.add_transition(s_b2, Transition::Char(b'b'));
+        s_c1.add_transition(s_c2, Transition::Char(b'c'));
+        s_b2.add_transition(s_bc2, Transition::Epsilon);
+        s_c2.add_transition(s_bc2, Transition::Epsilon);
+
+        // Kleene closure (zero or more repetitions)
+        s_kleene1.add_transition(s_kleene2, Transition::Epsilon); // skip (b|c) entirely
+        s_kleene1.add_transition(s_bc1, Transition::Epsilon); // enter (b|c)
+        s_bc2.add_transition(s_kleene2, Transition::Epsilon); // exit after a match
+        s_bc2.add_transition(s_bc1, Transition::Epsilon); // repeat (b|c)
+
+        // Concatenation
+        s_d1.add_transition(s_d2, Transition::Char(b'd'));
+
+        // Connecting the components
+        start.add_transition(s_a1, Transition::Epsilon);
+        s_a2.add_transition(s_kleene1, Transition::Epsilon);
+        s_kleene2.add_transition(s_d1, Transition::Epsilon);
+
+        s_d2.set_accept(0);
+
+        assert!(nfa.matches_string(start, "abd"));
+        assert!(nfa.matches_string(start, "acd"));
+        assert!(nfa.matches_string(start, "abccbd"));
+        assert!(nfa.matches_string(start, "ad")); // zero repetitions of (b|c)
+        assert!(!nfa.matches_string(start, "abc")); // missing the trailing 'd'
+
+        let dfa = DFA::from(start);
+        assert_eq!(dfa.match_string("abd"), Some(0));
+        assert_eq!(dfa.match_string("acd"), Some(0));
+        assert_eq!(dfa.match_string("abccbd"), Some(0));
+        assert_eq!(dfa.match_string("ad"), Some(0));
+        assert_eq!(dfa.match_string("abc"), None);
+    }
+}
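
To see the Flex-style priority selection in action (the test above uses a single rule, so ties never occur), here is a sketch in the same style, using only the `NFA`/`DFA` API exercised above; the two rule shapes are illustrative, not from the commit:

```rust
// Two overlapping rules, as in a lexer definition:
//   rule 0 (declared first, so higher priority): exactly "ab"
//   rule 1: 'a' followed by any number of 'b's
let nfa = NFA::new();
let start = nfa.new_state();

let k1 = nfa.new_state();
let k2 = nfa.new_state();
start.add_transition(k1, Transition::Char(b'a'));
k1.add_transition(k2, Transition::Char(b'b'));
k2.set_accept(0);

let i1 = nfa.new_state();
start.add_transition(i1, Transition::Char(b'a'));
i1.add_transition(i1, Transition::Char(b'b')); // self-loop: zero or more 'b's
i1.set_accept(1);

let dfa = DFA::from(start);
assert_eq!(dfa.match_string("ab"), Some(0)); // both rules accept; lowest ID wins
assert_eq!(dfa.match_string("a"), Some(1)); // only rule 1 accepts
assert_eq!(dfa.match_string("abb"), Some(1)); // longest accepting prefix is rule 1's
```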

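And to make `match_bytes` itself concrete: the tables can also be written by hand. A hypothetical two-state DFA for the single rule `a+` (rule ID 0), showing the byte-indexed transition layout and the longest-accepting-prefix behavior:

```rust
let mut t0 = [INVALID_STATE; 256];
t0[b'a' as usize] = 1; // state 0 --'a'--> state 1
let mut t1 = [INVALID_STATE; 256];
t1[b'a' as usize] = 1; // state 1 --'a'--> state 1

let dfa = DFA {
    states: vec![
        DFAState { transitions: t0, accept_lexer_rule_id: None },
        DFAState { transitions: t1, accept_lexer_rule_id: Some(0) },
    ],
};

assert_eq!(dfa.match_string("aaa"), Some(0));
assert_eq!(dfa.match_string("aab"), Some(0)); // rule 0 matched the prefix "aa"
assert_eq!(dfa.match_string("b"), None); // no accepting prefix
```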