1- //! This module takes care of lexing python source text.
1+ //! This module takes care of lexing Python source text.
22//!
3- //! This means source code is translated into separate tokens.
4-
3+ //! This means source code is scanned and translated into separate tokens. The rules
4+ //! governing what is and is not a valid token are defined in the Python reference
5+ //! guide section on [Lexical analysis].
6+ //!
7+ //! The primary function in this module is [`make_tokenizer`], which takes a string slice
8+ //! and returns an iterator over the tokens in the source code. The tokens are currently returned
9+ //! as a `Result<Spanned, LexicalError>`, where [`Spanned`] is a `(start, token, end)` tuple:
10+ //! the start [`Location`], the [`Tok`] denoting the token, and the end [`Location`].
11+ //!
12+ //! # Example
13+ //!
14+ //! ```
15+ //! use rustpython_parser::lexer::{make_tokenizer, Tok};
16+ //! use rustpython_parser::token::StringKind;
17+ //!
18+ //! let source = "x = 'RustPython'";
19+ //! let tokens = make_tokenizer(source)
20+ //! .map(|tok| tok.expect("Failed to lex"))
21+ //! .collect::<Vec<_>>();
22+ //!
23+ //! for (start, token, end) in tokens {
24+ //! println!(
25+ //! "{0},{1}-{2},{3:<5} {token:?}",
26+ //! start.row(),
27+ //! start.column(),
28+ //! end.row(),
29+ //! end.column(),
30+ //! );
31+ //! }
32+ //! ```
33+ //!
34+ //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
535pub use super :: token:: { StringKind , Tok } ;
636use crate :: ast:: Location ;
737use crate :: error:: { LexicalError , LexicalErrorType } ;
@@ -16,6 +46,8 @@ use std::str::FromStr;
1646use unic_emoji_char:: is_emoji_presentation;
1747use unic_ucd_ident:: { is_xid_continue, is_xid_start} ;
1848
49+ // Indentations are tracked by a stack of indentation levels. IndentationLevel keeps
50+ // track of the number of tabs and spaces at the current level.
1951#[ derive( Clone , Copy , PartialEq , Debug , Default ) ]
2052struct IndentationLevel {
2153 tabs : u32 ,
@@ -57,6 +89,9 @@ impl IndentationLevel {
5789 }
5890}
5991
92+ // The indentations stack is used to keep track of the current indentation level.
93+ // Similar to the CPython implementation, the Indentations stack always has at
94+ // least one level which is never popped. See section 2.1.8 (Indentation) of the Python language reference.
6095#[ derive( Debug ) ]
6196struct Indentations {
6297 indent_stack : Vec < IndentationLevel > ,
@@ -93,6 +128,8 @@ impl Default for Indentations {
93128 }
94129}
95130
131+ // A CharWindow is a sliding window over an iterator of chars. It is used to
132+ // allow for look-ahead when scanning tokens from the source code.
96133struct CharWindow < T : Iterator < Item = char > , const N : usize > {
97134 source : T ,
98135 window : [ Option < char > ; N ] ,
@@ -129,28 +166,53 @@ where
129166 }
130167}
131168
169+ /// A lexer for Python source code.
132170pub struct Lexer < T : Iterator < Item = char > > {
171+ // Sliding window over the source characters being lexed (provides look-ahead).
133172 window : CharWindow < T , 3 > ,
173+ // Are we at the beginning of a line?
134174 at_begin_of_line : bool ,
135- nesting : usize , // Amount of parenthesis
175+ // Number of currently open parentheses (nesting depth).
176+ nesting : usize ,
177+ // Indentation levels.
136178 indentations : Indentations ,
137-
179+ // Pending list of tokens to be returned.
138180 pending : Vec < Spanned > ,
181+ // The current location.
139182 location : Location ,
140183}
141184
142185// generated in build.rs, in gen_phf()
186+ /// A map of keywords to their tokens.
143187pub static KEYWORDS : phf:: Map < & ' static str , Tok > =
144188 include ! ( concat!( env!( "OUT_DIR" ) , "/keywords.rs" ) ) ;
145189
190+ /// Contains a Token along with its start and end location.
146191pub type Spanned = ( Location , Tok , Location ) ;
192+ /// The result of lexing a token.
147193pub type LexResult = Result < Spanned , LexicalError > ;
148194
195+ /// Create a new tokenizer from a source string.
196+ ///
197+ /// # Examples
198+ ///
199+ /// ```
200+ /// use rustpython_parser::lexer::make_tokenizer;
201+ ///
202+ /// let source = "def hello(): return 'world'";
203+ /// let tokenizer = make_tokenizer(source);
204+ ///
205+ /// for token in tokenizer {
206+ /// println!("{:?}", token);
207+ /// }
208+ /// ```
149209#[ inline]
150210pub fn make_tokenizer ( source : & str ) -> impl Iterator < Item = LexResult > + ' _ {
    // Delegates to `make_tokenizer_located`, starting at `Location::default()`.
151211 make_tokenizer_located ( source, Location :: default ( ) )
152212}
153213
214+ /// Create a new tokenizer from a source string, starting at a given location.
215+ /// You probably want to use [`make_tokenizer`] instead.
154216pub fn make_tokenizer_located (
155217 source : & str ,
156218 start_location : Location ,
@@ -162,6 +224,8 @@ impl<T> Lexer<T>
162224where
163225 T : Iterator < Item = char > ,
164226{
227+ /// Create a new lexer from T and a starting location. You probably want to use
228+ /// [`make_tokenizer`] instead.
165229 pub fn new ( input : T , start : Location ) -> Self {
166230 let mut lxr = Lexer {
167231 at_begin_of_line : true ,
@@ -172,6 +236,7 @@ where
172236 location : start,
173237 window : CharWindow :: new ( input) ,
174238 } ;
239+ // Fill the window.
175240 lxr. window . slide ( ) ;
176241 lxr. window . slide ( ) ;
177242 lxr. window . slide ( ) ;
@@ -182,7 +247,7 @@ where
182247 lxr
183248 }
184249
185- // Lexer helper functions:
250+ /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
186251 fn lex_identifier ( & mut self ) -> LexResult {
187252 // Detect potential string like rb'' b'' f'' u'' r''
188253 match self . window [ ..3 ] {
@@ -384,7 +449,7 @@ where
384449 }
385450 }
386451
387- /// Skip everything until end of line
452+ /// Lex a single comment.
388453 fn lex_comment ( & mut self ) -> LexResult {
389454 let start_pos = self . get_pos ( ) ;
390455 let mut value = String :: new ( ) ;
@@ -400,6 +465,7 @@ where
400465 }
401466 }
402467
468+ /// Lex a string literal.
403469 fn lex_string ( & mut self , kind : StringKind ) -> LexResult {
404470 let start_pos = self . get_pos ( ) ;
405471 for _ in 0 ..kind. prefix_len ( ) {
@@ -474,13 +540,17 @@ where
474540 Ok ( ( start_pos, tok, end_pos) )
475541 }
476542
543+ // Checks if the character c is a valid identifier start character as described
544+ // in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
477545 fn is_identifier_start ( & self , c : char ) -> bool {
478546 match c {
        // ASCII letters and the underscore are accepted directly.
479547 'a' ..='z' | 'A' ..='Z' | '_' => true ,
        // Every other character is decided by `is_xid_start`.
480548 _ => is_xid_start ( c) ,
481549 }
482550 }
483551
552+ // Checks if the character at the front of the window is a valid continuation character
553+ // as described in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
484554 fn is_identifier_continuation ( & self ) -> bool {
485555 match self . window [ 0 ] {
486556 Some ( 'a' ..='z' | 'A' ..='Z' | '_' | '0' ..='9' ) => true ,
@@ -489,8 +559,8 @@ where
489559 }
490560 }
491561
492- /// This is the main entry point. Call this function to retrieve the next token.
493- /// This function is used by the iterator implementation.
562+ // This is the main entry point. Call this function to retrieve the next token.
563+ // This function is used by the iterator implementation.
494564 fn inner_next ( & mut self ) -> LexResult {
495565 // top loop, keep on processing, until we have something pending.
496566 while self . pending . is_empty ( ) {
@@ -505,7 +575,7 @@ where
505575 Ok ( self . pending . remove ( 0 ) )
506576 }
507577
508- /// Given we are at the start of a line, count the number of spaces and/or tabs until the first character.
578+ // Given we are at the start of a line, count the number of spaces and/or tabs until the first character.
509579 fn eat_indentation ( & mut self ) -> Result < IndentationLevel , LexicalError > {
510580 // Determine indentation:
511581 let mut spaces: u32 = 0 ;
@@ -574,6 +644,7 @@ where
574644 Ok ( IndentationLevel { tabs, spaces } )
575645 }
576646
647+ // Push/pop indents/dedents based on the current indentation level.
577648 fn handle_indentations ( & mut self ) -> Result < ( ) , LexicalError > {
578649 let indentation_level = self . eat_indentation ( ) ?;
579650
@@ -626,10 +697,10 @@ where
626697 Ok ( ( ) )
627698 }
628699
629- /// Take a look at the next character, if any, and decide upon the next steps.
700+ // Take a look at the next character, if any, and decide upon the next steps.
630701 fn consume_normal ( & mut self ) -> Result < ( ) , LexicalError > {
631- // Check if we have some character:
632702 if let Some ( c) = self . window [ 0 ] {
703+ // Identifiers are the most common case.
633704 if self . is_identifier_start ( c) {
634705 let identifier = self . lex_identifier ( ) ?;
635706 self . emit ( identifier) ;
@@ -666,7 +737,7 @@ where
666737 Ok ( ( ) )
667738 }
668739
669- /// Okay, we are facing a weird character, what is it? Determine that .
740+ // Dispatch based on the given character.
670741 fn consume_character ( & mut self , c : char ) -> Result < ( ) , LexicalError > {
671742 match c {
672743 '0' ..='9' => {
@@ -1060,12 +1131,13 @@ where
10601131 location : self . get_pos ( ) ,
10611132 } ) ;
10621133 }
1063- } // Ignore all the rest..
1134+ }
10641135 }
10651136
10661137 Ok ( ( ) )
10671138 }
10681139
1140+ // Used by single character tokens to advance the window and emit the correct token.
10691141 fn eat_single_char ( & mut self , ty : Tok ) {
10701142 let tok_start = self . get_pos ( ) ;
10711143 self . next_char ( ) . unwrap_or_else ( || unsafe {
@@ -1077,7 +1149,7 @@ where
10771149 self . emit ( ( tok_start, ty, tok_end) ) ;
10781150 }
10791151
1080- /// Helper function to go to the next character coming up.
1152+ // Helper function to go to the next character coming up.
10811153 fn next_char ( & mut self ) -> Option < char > {
10821154 let mut c = self . window [ 0 ] ;
10831155 self . window . slide ( ) ;
@@ -1099,32 +1171,27 @@ where
10991171 c
11001172 }
11011173
1102- /// Helper function to retrieve the current position.
1174+ // Helper function to retrieve the current position.
11031175 fn get_pos ( & self ) -> Location {
        // Returns the lexer's `location` field by value.
11041176 self . location
11051177 }
11061178
1107- /// Helper function to emit a lexed token to the queue of tokens.
1179+ // Helper function to emit a lexed token to the queue of tokens.
11081180 fn emit ( & mut self , spanned : Spanned ) {
        // Tokens are appended at the back of `pending`; `inner_next` hands them
        // out from the front via `pending.remove(0)`, so emission order is preserved.
11091181 self . pending . push ( spanned) ;
11101182 }
11111183}
11121184
1113- /* Implement iterator pattern for the get_tok function.
1114-
1115- Calling the next element in the iterator will yield the next lexical
1116- token.
1117- */
1185+ // Implement the `Iterator` trait for `Lexer`.
1186+ // Each call to `next` yields the next lexical
1187+ // token.
11181188impl < T > Iterator for Lexer < T >
11191189where
11201190 T : Iterator < Item = char > ,
11211191{
11221192 type Item = LexResult ;
11231193
11241194 fn next ( & mut self ) -> Option < Self :: Item > {
1125- // Idea: create some sort of hash map for single char tokens:
1126- // let mut X = HashMap::new();
1127- // X.insert('=', Tok::Equal);
11281195 let token = self . inner_next ( ) ;
11291196 trace ! (
11301197 "Lex token {:?}, nesting={:?}, indent stack: {:?}" ,
0 commit comments