1+ use itertools:: { Itertools , MultiPeek } ;
2+
13use crate :: lexer:: { LexResult , Tok } ;
24pub use crate :: mode:: Mode ;
35
4- /// Collect all tokens from a token stream in a vector.
// Collection stops right after the first `Err` item: that error is still pushed, so it
// ends up as the last element of the returned vector, and nothing after it is read from
// the stream.
5- fn collect_tokens ( tokenizer : impl IntoIterator < Item = LexResult > ) -> Vec < LexResult > {
6- let mut tokens: Vec < LexResult > = vec ! [ ] ;
7- for tok in tokenizer {
// Remember whether this item is an error before it is moved into `tokens`.
8- let is_err = tok. is_err ( ) ;
9- tokens. push ( tok) ;
10- if is_err {
11- break ;
12- }
13- }
14- tokens
15- }
16-
17- /// Modify a token stream to accommodate soft keywords (namely, `match` and `case`).
6+ /// An [`Iterator`] that transforms a token stream to accommodate soft keywords (namely, `match`
7+ /// and `case`).
188///
199/// [PEP 634](https://www.python.org/dev/peps/pep-0634/) introduced the `match` and `case` keywords
2010/// as soft keywords, meaning that they can be used as identifiers (e.g., variable names) in certain
@@ -25,93 +15,112 @@ fn collect_tokens(tokenizer: impl IntoIterator<Item = LexResult>) -> Vec<LexResu
2515///
2616/// Handling soft keywords in this intermediary pass allows us to simplify both the lexer and
2717/// parser, as neither of them needs to be aware of soft keywords.
28- pub fn soft_keywords (
29- tokenizer : impl IntoIterator < Item = LexResult > ,
30- mode : Mode ,
31- ) -> Vec < LexResult > {
32- let mut tokenizer: Vec < LexResult > = collect_tokens ( tokenizer) ;
33- let mut start_of_line = matches ! ( mode, Mode :: Module | Mode :: Interactive ) ;
34- for i in 0 ..tokenizer. len ( ) {
35- // If the token is a `match` or `case` token, check if it's used as an identifier.
36- // We assume every `match` or `case` is an identifier unless all of the following
37- // conditions are met:
38- // 1. The token is at the start of a logical line.
39- // 2. The logical line contains a top-level colon (that is, a colon that is not nested
40- // inside a parenthesized expression, list, or dictionary).
41- // 3. The top-level colon is not the immediate sibling of a `match` or `case` token.
42- // (This is to avoid treating `match` and `case` as identifiers when annotated with
43- // type hints.)
44- if tokenizer[ i]
45- . as_ref ( )
46- . map_or ( false , |( _, tok, _) | matches ! ( tok, Tok :: Match | Tok :: Case ) )
47- {
48- let is_identifier = {
49- if !start_of_line {
50- // If the `match` or `case` token is not at the start of a line, it's definitely
51- // an identifier.
52- true
// State for the soft-keyword pass: wraps a stream of `LexResult` items and is itself
// driven as an iterator over the same item type.
18+ pub struct SoftKeywordTransformer < I >
19+ where
20+ I : Iterator < Item = LexResult > ,
21+ {
// The wrapped token stream. It is an itertools `MultiPeek` so the pass can look ahead
// over several upcoming tokens via `peek()` without consuming them.
22+ pub underlying : MultiPeek < I > ,
// Whether the next token to be yielded sits at the start of a logical line. Set from the
// `Mode` at construction time and recomputed from each token that is yielded.
23+ pub start_of_line : bool ,
24+ }
25+
26+ impl < I > SoftKeywordTransformer < I >
27+ where
28+ I : Iterator < Item = LexResult > ,
29+ {
// Build a transformer over `tokenizer`.
//
// `tokenizer` is wrapped with `multipeek()` so later look-ahead does not consume tokens.
// `mode` only decides the initial `start_of_line` flag: it starts out `true` for
// `Mode::Interactive` and `Mode::Module`, and `false` for any other mode.
30+ pub fn new ( tokenizer : I , mode : Mode ) -> Self {
31+ Self {
32+ underlying : tokenizer. multipeek ( ) ,
33+ start_of_line : matches ! ( mode, Mode :: Interactive | Mode :: Module ) ,
34+ }
35+ }
36+ }
37+
// Iterator implementation: yields the wrapped stream's items one by one, rewriting a
// `Tok::Match` / `Tok::Case` token into `Tok::Name { name: "match" | "case" }` when it is
// judged to be an identifier. Errors and all other tokens are passed through unchanged.
38+ impl < I > Iterator for SoftKeywordTransformer < I >
39+ where
40+ I : Iterator < Item = LexResult > ,
41+ {
42+ type Item = LexResult ;
43+
44+ #[ inline]
45+ fn next ( & mut self ) -> Option < LexResult > {
// Consume exactly one item from the wrapped stream; everything below only peeks.
46+ let mut next = self . underlying . next ( ) ;
47+ if let Some ( Ok ( ( start, tok, end) ) ) = next. as_ref ( ) {
48+ // If the token is a `match` or `case` token, check if it's used as an identifier.
49+ // We assume every `match` or `case` is an identifier unless all of the following
50+ // conditions are met:
51+ // 1. The token is at the start of a logical line.
52+ // 2. The logical line contains a top-level colon (that is, a colon that is not nested
53+ // inside a parenthesized expression, list, or dictionary).
54+ // 3. The top-level colon is not the immediate sibling of a `match` or `case` token.
55+ // (This is to avoid treating `match` and `case` as identifiers when annotated with
56+ // type hints.)
57+ if matches ! ( tok, Tok :: Match | Tok :: Case ) {
// Condition 1 fails: not at the start of a logical line, so this is an identifier.
// The replacement `Name` token keeps the original start and end positions.
58+ if !self . start_of_line {
59+ next = Some ( Ok ( (
60+ * start,
61+ Tok :: Name {
62+ name : if matches ! ( tok, Tok :: Match ) {
63+ "match" . to_string ( )
64+ } else {
65+ "case" . to_string ( )
66+ } ,
67+ } ,
68+ * end,
69+ ) ) ) ;
5370 } else {
54- //
55- let mut seen_colon = false ;
56- let mut first = true ;
// Nesting depth of (), [] and {} seen so far during the look-ahead; a colon only counts
// as "top-level" while all three are zero.
5771 let mut par_count = 0 ;
5872 let mut sqb_count = 0 ;
5973 let mut brace_count = 0 ;
60- for ( _, tok, _) in tokenizer. iter ( ) . skip ( i + 1 ) . flatten ( ) {
// `first` is true only while inspecting the token immediately after `match`/`case`.
74+ let mut first = true ;
75+ let mut seen_colon = false ;
// Look ahead without consuming: per itertools' `MultiPeek`, each `peek()` call returns the
// item one position further ahead. The scan stops at a `Newline` token, at the first `Err`
// item, or when the stream ends.
// NOTE(review): unlike the removed implementation, this loop does not `break` at the first
// top-level colon, so a later top-level colon on the same logical line also sets
// `seen_colon` even when the first colon directly followed the keyword — confirm intended.
76+ while let Some ( Ok ( ( _, tok, _) ) ) = self . underlying . peek ( ) {
6177 match tok {
6278 Tok :: Newline => break ,
6379 Tok :: Colon if par_count == 0 && sqb_count == 0 && brace_count == 0 => {
6480 if !first {
6581 seen_colon = true ;
6682 }
67- break ;
68- }
69- Tok :: Lpar => {
70- par_count += 1 ;
71- }
72- Tok :: Rpar => {
73- par_count -= 1 ;
74- }
75- Tok :: Lsqb => {
76- sqb_count += 1 ;
77- }
78- Tok :: Rsqb => {
79- sqb_count -= 1 ;
80- }
81- Tok :: Lbrace => {
82- brace_count += 1 ;
83- }
84- Tok :: Rbrace => {
85- brace_count -= 1 ;
8683 }
84+ Tok :: Lpar => par_count += 1 ,
85+ Tok :: Rpar => par_count -= 1 ,
86+ Tok :: Lsqb => sqb_count += 1 ,
87+ Tok :: Rsqb => sqb_count -= 1 ,
88+ Tok :: Lbrace => brace_count += 1 ,
89+ Tok :: Rbrace => brace_count -= 1 ,
8790 _ => { }
8891 }
8992 first = false ;
9093 }
91- !seen_colon
92- }
93- } ;
94- if is_identifier {
95- if let Ok ( ( _, tok, _) ) = & mut tokenizer[ i] {
96- if let Tok :: Match = tok {
97- * tok = Tok :: Name {
98- name : "match" . to_string ( ) ,
99- } ;
100- } else if let Tok :: Case = tok {
101- * tok = Tok :: Name {
102- name : "case" . to_string ( ) ,
103- } ;
// No qualifying top-level colon was found on this logical line: treat the keyword as an
// identifier and rewrite it to a `Name` token with the same start and end positions.
94+ if !seen_colon {
95+ next = Some ( Ok ( (
96+ * start,
97+ Tok :: Name {
98+ name : if matches ! ( tok, Tok :: Match ) {
99+ "match" . to_string ( )
100+ } else {
101+ "case" . to_string ( )
102+ } ,
103+ } ,
104+ * end,
105+ ) ) ) ;
104106 }
105107 }
106108 }
107109 }
108- start_of_line = tokenizer[ i] . as_ref ( ) . map_or ( false , |( _, tok, _) | {
109- matches ! (
110- tok,
111- Tok :: StartModule | Tok :: StartInteractive | Tok :: Newline | Tok :: Indent | Tok :: Dedent
112- )
110+
// Update the flag for the following call: the next token is at the start of a logical line
// only if the token being returned now is one of the listed kinds. End of stream (`None`)
// and `Err` items both yield `false`.
111+ self . start_of_line = next. as_ref ( ) . map_or ( false , |lex_result| {
112+ lex_result. as_ref ( ) . map_or ( false , |( _, tok, _) | {
113+ matches ! (
114+ tok,
115+ Tok :: StartModule
116+ | Tok :: StartInteractive
117+ | Tok :: Newline
118+ | Tok :: Indent
119+ | Tok :: Dedent
120+ )
121+ } )
113122 } ) ;
114- }
115123

116- tokenizer
124+ next
125+ }
117126}
0 commit comments