@@ -857,6 +857,26 @@ pub struct Tokenizer<'a> {
857857 unescape : bool ,
858858}
859859
/// Passed into [`Tokenizer::next_token`] as in some situations tokenization
/// is context dependent. The separate enum is used to be able to not clone
/// the previous token during [`TokenWithSpanIter`] iteration.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PrevTokenKind {
    /// The previous token was a `Token::Word(_)`.
    Word,
    /// The previous token was a `Token::Period`.
    Period,
    /// Any other previous token; the distinction does not matter to the tokenizer.
    Other,
}
869+
870+ impl From < & Token > for PrevTokenKind {
871+ fn from ( value : & Token ) -> Self {
872+ match value {
873+ Token :: Word ( _) => Self :: Word ,
874+ Token :: Period => Self :: Period ,
875+ _ => Self :: Other ,
876+ }
877+ }
878+ }
879+
860880impl < ' a > Tokenizer < ' a > {
861881 /// Create a new SQL tokenizer for the specified SQL statement
862882 ///
@@ -917,6 +937,23 @@ impl<'a> Tokenizer<'a> {
917937 self
918938 }
919939
940+ /// Return an iterator over tokens
941+ pub fn iter ( & mut self ) -> TokenWithSpanIter < ' a , ' _ > {
942+ let state = State {
943+ peekable : self . query . chars ( ) . peekable ( ) ,
944+ line : 1 ,
945+ col : 1 ,
946+ } ;
947+
948+ let location = state. location ( ) ;
949+ TokenWithSpanIter {
950+ state,
951+ location,
952+ tokenizer : self ,
953+ prev_token_kind : None ,
954+ }
955+ }
956+
920957 /// Tokenize the statement and produce a vector of tokens
921958 pub fn tokenize ( & mut self ) -> Result < Vec < Token > , TokenizerError > {
922959 let twl = self . tokenize_with_location ( ) ?;
@@ -936,19 +973,8 @@ impl<'a> Tokenizer<'a> {
936973 & mut self ,
937974 buf : & mut Vec < TokenWithSpan > ,
938975 ) -> Result < ( ) , TokenizerError > {
939- let mut state = State {
940- peekable : self . query . chars ( ) . peekable ( ) ,
941- line : 1 ,
942- col : 1 ,
943- } ;
944-
945- let mut location = state. location ( ) ;
946- while let Some ( token) = self . next_token ( & mut state, buf. last ( ) . map ( |t| & t. token ) ) ? {
947- let span = location. span_to ( state. location ( ) ) ;
948-
949- buf. push ( TokenWithSpan { token, span } ) ;
950-
951- location = state. location ( ) ;
976+ for token in self . iter ( ) {
977+ buf. push ( token?) ;
952978 }
953979 Ok ( ( ) )
954980 }
@@ -983,7 +1009,7 @@ impl<'a> Tokenizer<'a> {
9831009 fn next_token (
9841010 & self ,
9851011 chars : & mut State ,
986- prev_token : Option < & Token > ,
1012+ prev_token_kind : Option < PrevTokenKind > ,
9871013 ) -> Result < Option < Token > , TokenizerError > {
9881014 match chars. peek ( ) {
9891015 Some ( & ch) => match ch {
@@ -1263,7 +1289,7 @@ impl<'a> Tokenizer<'a> {
12631289 // if the prev token is not a word, then this is not a valid sql
12641290 // word or number.
12651291 if ch == '.' && chars. peekable . clone ( ) . nth ( 1 ) == Some ( '_' ) {
1266- if let Some ( Token :: Word ( _ ) ) = prev_token {
1292+ if let Some ( PrevTokenKind :: Word ) = prev_token_kind {
12671293 chars. next ( ) ;
12681294 return Ok ( Some ( Token :: Period ) ) ;
12691295 }
@@ -1307,7 +1333,7 @@ impl<'a> Tokenizer<'a> {
13071333 // we should yield the dot as a dedicated token so compound identifiers
13081334 // starting with digits can be parsed correctly.
13091335 if s == "." && self . dialect . supports_numeric_prefix ( ) {
1310- if let Some ( Token :: Word ( _ ) ) = prev_token {
1336+ if let Some ( PrevTokenKind :: Word ) = prev_token_kind {
13111337 return Ok ( Some ( Token :: Period ) ) ;
13121338 }
13131339 }
@@ -1366,7 +1392,7 @@ impl<'a> Tokenizer<'a> {
13661392 s += word. as_str ( ) ;
13671393 return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
13681394 }
1369- } else if prev_token == Some ( & Token :: Period ) {
1395+ } else if prev_token_kind == Some ( PrevTokenKind :: Period ) {
13701396 // If the previous token was a period, thus not belonging to a number,
13711397 // the value we have is part of an identifier.
13721398 return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
@@ -2299,6 +2325,34 @@ impl<'a> Tokenizer<'a> {
22992325 }
23002326}
23012327
/// Iterator over tokens. Created by [`Tokenizer::iter`].
pub struct TokenWithSpanIter<'a, 'b> {
    // Current position in the query's character stream.
    state: State<'a>,
    // Location where the next token begins; used to build each token's span.
    location: Location,
    // The tokenizer whose `next_token` drives this iterator.
    tokenizer: &'b mut Tokenizer<'a>,
    // Kind of the previously yielded token (`None` before the first one);
    // some tokenization decisions are context dependent.
    prev_token_kind: Option<PrevTokenKind>,
}
2335+
2336+ impl Iterator for TokenWithSpanIter < ' _ , ' _ > {
2337+ type Item = Result < TokenWithSpan , TokenizerError > ;
2338+
2339+ fn next ( & mut self ) -> Option < Self :: Item > {
2340+ let token = match self
2341+ . tokenizer
2342+ . next_token ( & mut self . state , self . prev_token_kind )
2343+ . transpose ( ) ?
2344+ {
2345+ Err ( err) => return Some ( Err ( err) ) ,
2346+ Ok ( token) => token,
2347+ } ;
2348+ self . prev_token_kind = Some ( PrevTokenKind :: from ( & token) ) ;
2349+ let span = self . location . span_to ( self . state . location ( ) ) ;
2350+ self . location = self . state . location ( ) ;
2351+ let token = TokenWithSpan { token, span } ;
2352+ Some ( Ok ( token) )
2353+ }
2354+ }
2355+
23022356/// Read from `chars` until `predicate` returns `false` or EOF is hit.
23032357/// Return the characters read as String, and keep the first non-matching
23042358/// char available as `chars.next()`.
@@ -2577,6 +2631,39 @@ mod tests {
25772631 compare ( expected, tokens) ;
25782632 }
25792633
2634+ #[ test]
2635+ fn tokenize_iterator_map ( ) {
2636+ let sql = String :: from ( "SELECT ?" ) ;
2637+ let dialect = GenericDialect { } ;
2638+ let mut param_num = 1 ;
2639+
2640+ let tokens = Tokenizer :: new ( & dialect, & sql)
2641+ . iter ( )
2642+ . map ( |token| {
2643+ let token = token?;
2644+ Ok ( match token. token {
2645+ Token :: Placeholder ( n) => Token :: Placeholder ( if n == "?" {
2646+ let ret = format ! ( "${}" , param_num) ;
2647+ param_num += 1 ;
2648+ ret
2649+ } else {
2650+ n
2651+ } ) ,
2652+ _ => token. token ,
2653+ } )
2654+ } )
2655+ . collect :: < Result < Vec < _ > , TokenizerError > > ( )
2656+ . unwrap ( ) ;
2657+
2658+ let expected = vec ! [
2659+ Token :: make_keyword( "SELECT" ) ,
2660+ Token :: Whitespace ( Whitespace :: Space ) ,
2661+ Token :: Placeholder ( "$1" . to_string( ) ) ,
2662+ ] ;
2663+
2664+ compare ( expected, tokens) ;
2665+ }
2666+
25802667 #[ test]
25812668 fn tokenize_select_float ( ) {
25822669 let sql = String :: from ( "SELECT .1" ) ;
0 commit comments