
Commit b4445d5

Author: Альберт Скальт (committed)
add iterator over tokens in Tokenizer
This patch adds the ability to iterate over the recognized tokens by converting a `Tokenizer` into an iterator. This allows token mapping to be performed in a single pass, instead of mapping the resulting vector in an additional loop.
1 parent 153e7c5 commit b4445d5
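
For a sense of the single-pass mapping this enables, here is a minimal sketch modeled on the `tokenize_iterator_map` test added in this patch; it assumes the patch is applied and the usual sqlparser module layout:

use sqlparser::dialect::GenericDialect;
use sqlparser::tokenizer::{Token, Tokenizer, TokenizerError};

fn main() -> Result<(), TokenizerError> {
    let dialect = GenericDialect {};
    let mut param_num = 1;
    // One pass: tokenize and rewrite each "?" placeholder to "$1", "$2", ...
    let tokens = Tokenizer::new(&dialect, "SELECT ? + ?")
        .iter() // yields Result<TokenWithSpan, TokenizerError>
        .map(|token| {
            let token = token?; // propagate tokenizer errors
            Ok(match token.token {
                Token::Placeholder(p) if p == "?" => {
                    let renamed = format!("${}", param_num);
                    param_num += 1;
                    Token::Placeholder(renamed)
                }
                other => other,
            })
        })
        .collect::<Result<Vec<_>, TokenizerError>>()?;
    println!("{tokens:?}");
    Ok(())
}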

File tree

1 file changed (+104, -17 lines)


src/tokenizer.rs

Lines changed: 104 additions & 17 deletions
@@ -857,6 +857,26 @@ pub struct Tokenizer<'a> {
     unescape: bool,
 }
 
+/// Passed into [`Tokenizer::next_token`] because in some situations
+/// tokenization is context dependent. A separate enum is used so that the
+/// previous token need not be cloned during [`TokenWithSpanIter`] iteration.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum PrevTokenKind {
+    Word,
+    Period,
+    Other,
+}
+
+impl From<&Token> for PrevTokenKind {
+    fn from(value: &Token) -> Self {
+        match value {
+            Token::Word(_) => Self::Word,
+            Token::Period => Self::Period,
+            _ => Self::Other,
+        }
+    }
+}
+
 impl<'a> Tokenizer<'a> {
     /// Create a new SQL tokenizer for the specified SQL statement
     ///
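
As the doc comment notes, the iterator hands each `Token` to its caller by value, so remembering the previous token for context would otherwise require cloning it (a `Token::Word` owns heap data); a `Copy` summary is free to keep across iterations. A standalone sketch of this previous-item-summary pattern, with illustrative names that are not part of the patch:

#[derive(Clone, Copy, PartialEq)]
enum PrevKind {
    Alpha,
    Other,
}

fn main() {
    let mut prev: Option<PrevKind> = None;
    for c in "ab.1.c".chars() {
        // A context-dependent decision driven by the cheap Copy summary,
        // mirroring how next_token consults prev_token_kind rather than
        // the previous Token itself.
        if c == '.' && prev == Some(PrevKind::Alpha) {
            println!("dot follows an identifier character");
        }
        prev = Some(if c.is_alphabetic() {
            PrevKind::Alpha
        } else {
            PrevKind::Other
        });
    }
}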
@@ -917,6 +937,23 @@ impl<'a> Tokenizer<'a> {
         self
     }
 
+    /// Return an iterator over tokens
+    pub fn iter(&mut self) -> TokenWithSpanIter<'a, '_> {
+        let state = State {
+            peekable: self.query.chars().peekable(),
+            line: 1,
+            col: 1,
+        };
+
+        let location = state.location();
+        TokenWithSpanIter {
+            state,
+            location,
+            tokenizer: self,
+            prev_token_kind: None,
+        }
+    }
+
     /// Tokenize the statement and produce a vector of tokens
     pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
         let twl = self.tokenize_with_location()?;
@@ -936,19 +973,8 @@
         &mut self,
         buf: &mut Vec<TokenWithSpan>,
     ) -> Result<(), TokenizerError> {
-        let mut state = State {
-            peekable: self.query.chars().peekable(),
-            line: 1,
-            col: 1,
-        };
-
-        let mut location = state.location();
-        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
-            let span = location.span_to(state.location());
-
-            buf.push(TokenWithSpan { token, span });
-
-            location = state.location();
+        for token in self.iter() {
+            buf.push(token?);
         }
         Ok(())
     }
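
The rewritten loop relies on the iterator yielding `Result` items: `token?` propagates the first `TokenizerError` out of `tokenize_with_location_into_buf` and stops consuming, leaving the tokens read so far in `buf`, just as the old `while let` with `?` did. A standalone sketch of that short-circuiting shape, with `i32` and `String` standing in for `TokenWithSpan` and `TokenizerError`:

fn fill(buf: &mut Vec<i32>, items: impl Iterator<Item = Result<i32, String>>) -> Result<(), String> {
    for item in items {
        // The first Err aborts both the loop and the function.
        buf.push(item?);
    }
    Ok(())
}

fn main() {
    let mut buf = Vec::new();
    let items = vec![Ok(1), Ok(2), Err("boom".to_string()), Ok(3)];
    assert_eq!(fill(&mut buf, items.into_iter()), Err("boom".to_string()));
    assert_eq!(buf, vec![1, 2]); // items before the error were kept
}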
@@ -983,7 +1009,7 @@
     fn next_token(
         &self,
         chars: &mut State,
-        prev_token: Option<&Token>,
+        prev_token_kind: Option<PrevTokenKind>,
     ) -> Result<Option<Token>, TokenizerError> {
         match chars.peek() {
             Some(&ch) => match ch {
@@ -1263,7 +1289,7 @@
                 // if the prev token is not a word, then this is not a valid sql
                 // word or number.
                 if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
-                    if let Some(Token::Word(_)) = prev_token {
+                    if let Some(PrevTokenKind::Word) = prev_token_kind {
                         chars.next();
                         return Ok(Some(Token::Period));
                     }
@@ -1307,7 +1333,7 @@
                 // we should yield the dot as a dedicated token so compound identifiers
                 // starting with digits can be parsed correctly.
                 if s == "." && self.dialect.supports_numeric_prefix() {
-                    if let Some(Token::Word(_)) = prev_token {
+                    if let Some(PrevTokenKind::Word) = prev_token_kind {
                         return Ok(Some(Token::Period));
                     }
                 }
@@ -1366,7 +1392,7 @@
                         s += word.as_str();
                         return Ok(Some(Token::make_word(s.as_str(), None)));
                     }
-                } else if prev_token == Some(&Token::Period) {
+                } else if prev_token_kind == Some(PrevTokenKind::Period) {
                     // If the previous token was a period, thus not belonging to a number,
                     // the value we have is part of an identifier.
                     return Ok(Some(Token::make_word(s.as_str(), None)));
@@ -2299,6 +2325,34 @@
     }
 }
 
+/// Iterator over tokens.
+pub struct TokenWithSpanIter<'a, 'b> {
+    state: State<'a>,
+    location: Location,
+    tokenizer: &'b mut Tokenizer<'a>,
+    prev_token_kind: Option<PrevTokenKind>,
+}
+
+impl Iterator for TokenWithSpanIter<'_, '_> {
+    type Item = Result<TokenWithSpan, TokenizerError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let token = match self
+            .tokenizer
+            .next_token(&mut self.state, self.prev_token_kind)
+            .transpose()?
+        {
+            Err(err) => return Some(Err(err)),
+            Ok(token) => token,
+        };
+        self.prev_token_kind = Some(PrevTokenKind::from(&token));
+        let span = self.location.span_to(self.state.location());
+        self.location = self.state.location();
+        let token = TokenWithSpan { token, span };
+        Some(Ok(token))
+    }
+}
+
 /// Read from `chars` until `predicate` returns `false` or EOF is hit.
 /// Return the characters read as String, and keep the first non-matching
 /// char available as `chars.next()`.
@@ -2577,6 +2631,39 @@ mod tests {
         compare(expected, tokens);
     }
 
+    #[test]
+    fn tokenize_iterator_map() {
+        let sql = String::from("SELECT ?");
+        let dialect = GenericDialect {};
+        let mut param_num = 1;
+
+        let tokens = Tokenizer::new(&dialect, &sql)
+            .iter()
+            .map(|token| {
+                let token = token?;
+                Ok(match token.token {
+                    Token::Placeholder(n) => Token::Placeholder(if n == "?" {
+                        let ret = format!("${}", param_num);
+                        param_num += 1;
+                        ret
+                    } else {
+                        n
+                    }),
+                    _ => token.token,
+                })
+            })
+            .collect::<Result<Vec<_>, TokenizerError>>()
+            .unwrap();
+
+        let expected = vec![
+            Token::make_keyword("SELECT"),
+            Token::Whitespace(Whitespace::Space),
+            Token::Placeholder("$1".to_string()),
+        ];
+
+        compare(expected, tokens);
+    }
+
     #[test]
     fn tokenize_select_float() {
         let sql = String::from("SELECT .1");
