Skip to content

Commit 1e97596

Browse files
committed
Document lexer.
1 parent e4096fb commit 1e97596

File tree

1 file changed

+92
-25
lines changed

1 file changed

+92
-25
lines changed

compiler/parser/src/lexer.rs

Lines changed: 92 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,37 @@
1-
//! This module takes care of lexing python source text.
1+
//! This module takes care of lexing Python source text.
22
//!
3-
//! This means source code is translated into separate tokens.
4-
3+
//! This means source code is scanned and translated into separate tokens. The rules
4+
//! governing what is and is not a valid token are defined in the Python reference
5+
//! guide section on [Lexical analysis].
6+
//!
7+
//! The primary function in this module is [`make_tokenizer`], which takes a string slice
8+
//! and returns an iterator over the tokens in the source code. The tokens are currently returned
9+
//! as a `Result<Spanned, LexicalError>`, where [`Spanned`] is a tuple containing the
10+
//! start and end [`Location`] and a [`Tok`] denoting the token.
11+
//!
12+
//! # Example
13+
//!
14+
//! ```
15+
//! use rustpython_parser::lexer::{make_tokenizer, Tok};
16+
//! use rustpython_parser::token::StringKind;
17+
//!
18+
//! let source = "x = 'RustPython'";
19+
//! let tokens = make_tokenizer(source)
20+
//! .map(|tok| tok.expect("Failed to lex"))
21+
//! .collect::<Vec<_>>();
22+
//!
23+
//! for (start, token, end) in tokens {
24+
//! println!(
25+
//! "{0},{1}-{2},{3:<5} {token:?}",
26+
//! start.row(),
27+
//! start.column(),
28+
//! end.row(),
29+
//! end.column(),
30+
//! );
31+
//! }
32+
//! ```
33+
//!
34+
//! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html
535
pub use super::token::{StringKind, Tok};
636
use crate::ast::Location;
737
use crate::error::{LexicalError, LexicalErrorType};
@@ -16,6 +46,8 @@ use std::str::FromStr;
1646
use unic_emoji_char::is_emoji_presentation;
1747
use unic_ucd_ident::{is_xid_continue, is_xid_start};
1848

49+
// Indentations are tracked by a stack of indentation levels. IndentationLevel keeps
50+
// track of the number of tabs and spaces at the current level.
1951
#[derive(Clone, Copy, PartialEq, Debug, Default)]
2052
struct IndentationLevel {
2153
tabs: u32,
@@ -57,6 +89,9 @@ impl IndentationLevel {
5789
}
5890
}
5991

92+
// The indentations stack is used to keep track of the current indentation level.
93+
// Similar to the CPython implementation, the Indentations stack always has at
94+
// least one level, which is never popped. See the Python Language Reference, section 2.1.8.
6095
#[derive(Debug)]
6196
struct Indentations {
6297
indent_stack: Vec<IndentationLevel>,
@@ -93,6 +128,8 @@ impl Default for Indentations {
93128
}
94129
}
95130

131+
// A CharWindow is a sliding window over an iterator of chars. It is used to
132+
// allow for look-ahead when scanning tokens from the source code.
96133
struct CharWindow<T: Iterator<Item = char>, const N: usize> {
97134
source: T,
98135
window: [Option<char>; N],
@@ -129,28 +166,53 @@ where
129166
}
130167
}
131168

169+
/// A lexer for Python source code.
132170
pub struct Lexer<T: Iterator<Item = char>> {
171+
// Contains the source code to be lexed.
133172
window: CharWindow<T, 3>,
173+
// Are we at the beginning of a line?
134174
at_begin_of_line: bool,
135-
nesting: usize, // Amount of parenthesis
175+
// Amount of open parentheses.
176+
nesting: usize,
177+
// Indentation levels.
136178
indentations: Indentations,
137-
179+
// Pending list of tokens to be returned.
138180
pending: Vec<Spanned>,
181+
// The current location.
139182
location: Location,
140183
}
141184

142185
// generated in build.rs, in gen_phf()
186+
/// A map of keywords to their tokens.
143187
pub static KEYWORDS: phf::Map<&'static str, Tok> =
144188
include!(concat!(env!("OUT_DIR"), "/keywords.rs"));
145189

190+
/// Contains a Token along with its start and end location.
146191
pub type Spanned = (Location, Tok, Location);
192+
/// The result of lexing a token.
147193
pub type LexResult = Result<Spanned, LexicalError>;
148194

195+
/// Create a new tokenizer from a source string.
196+
///
197+
/// # Examples
198+
///
199+
/// ```
200+
/// use rustpython_parser::lexer::make_tokenizer;
201+
///
202+
/// let source = "def hello(): return 'world'";
203+
/// let tokenizer = make_tokenizer(source);
204+
///
205+
/// for token in tokenizer {
206+
/// println!("{:?}", token);
207+
/// }
208+
/// ```
149209
#[inline]
150210
pub fn make_tokenizer(source: &str) -> impl Iterator<Item = LexResult> + '_ {
151211
make_tokenizer_located(source, Location::default())
152212
}
153213

214+
/// Create a new tokenizer from a source string, starting at a given location.
215+
/// You probably want to use [`make_tokenizer`] instead.
154216
pub fn make_tokenizer_located(
155217
source: &str,
156218
start_location: Location,
@@ -162,6 +224,8 @@ impl<T> Lexer<T>
162224
where
163225
T: Iterator<Item = char>,
164226
{
227+
/// Create a new lexer from T and a starting location. You probably want to use
228+
/// [`make_tokenizer`] instead.
165229
pub fn new(input: T, start: Location) -> Self {
166230
let mut lxr = Lexer {
167231
at_begin_of_line: true,
@@ -172,6 +236,7 @@ where
172236
location: start,
173237
window: CharWindow::new(input),
174238
};
239+
// Fill the window.
175240
lxr.window.slide();
176241
lxr.window.slide();
177242
lxr.window.slide();
@@ -182,7 +247,7 @@ where
182247
lxr
183248
}
184249

185-
// Lexer helper functions:
250+
/// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
186251
fn lex_identifier(&mut self) -> LexResult {
187252
// Detect potential string like rb'' b'' f'' u'' r''
188253
match self.window[..3] {
@@ -384,7 +449,7 @@ where
384449
}
385450
}
386451

387-
/// Skip everything until end of line
452+
/// Lex a single comment.
388453
fn lex_comment(&mut self) -> LexResult {
389454
let start_pos = self.get_pos();
390455
let mut value = String::new();
@@ -400,6 +465,7 @@ where
400465
}
401466
}
402467

468+
/// Lex a string literal.
403469
fn lex_string(&mut self, kind: StringKind) -> LexResult {
404470
let start_pos = self.get_pos();
405471
for _ in 0..kind.prefix_len() {
@@ -474,13 +540,17 @@ where
474540
Ok((start_pos, tok, end_pos))
475541
}
476542

543+
// Checks if the character c is a valid starting character as described
544+
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
477545
fn is_identifier_start(&self, c: char) -> bool {
478546
match c {
479547
'a'..='z' | 'A'..='Z' | '_' => true,
480548
_ => is_xid_start(c),
481549
}
482550
}
483551

552+
// Checks if the character c is a valid continuation character as described
553+
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
484554
fn is_identifier_continuation(&self) -> bool {
485555
match self.window[0] {
486556
Some('a'..='z' | 'A'..='Z' | '_' | '0'..='9') => true,
@@ -489,8 +559,8 @@ where
489559
}
490560
}
491561

492-
/// This is the main entry point. Call this function to retrieve the next token.
493-
/// This function is used by the iterator implementation.
562+
// This is the main entry point. Call this function to retrieve the next token.
563+
// This function is used by the iterator implementation.
494564
fn inner_next(&mut self) -> LexResult {
495565
// top loop, keep on processing, until we have something pending.
496566
while self.pending.is_empty() {
@@ -505,7 +575,7 @@ where
505575
Ok(self.pending.remove(0))
506576
}
507577

508-
/// Given we are at the start of a line, count the number of spaces and/or tabs until the first character.
578+
// Given we are at the start of a line, count the number of spaces and/or tabs until the first character.
509579
fn eat_indentation(&mut self) -> Result<IndentationLevel, LexicalError> {
510580
// Determine indentation:
511581
let mut spaces: u32 = 0;
@@ -574,6 +644,7 @@ where
574644
Ok(IndentationLevel { tabs, spaces })
575645
}
576646

647+
// Push/pop indents/dedents based on the current indentation level.
577648
fn handle_indentations(&mut self) -> Result<(), LexicalError> {
578649
let indentation_level = self.eat_indentation()?;
579650

@@ -626,10 +697,10 @@ where
626697
Ok(())
627698
}
628699

629-
/// Take a look at the next character, if any, and decide upon the next steps.
700+
// Take a look at the next character, if any, and decide upon the next steps.
630701
fn consume_normal(&mut self) -> Result<(), LexicalError> {
631-
// Check if we have some character:
632702
if let Some(c) = self.window[0] {
703+
// Identifiers are the most common case.
633704
if self.is_identifier_start(c) {
634705
let identifier = self.lex_identifier()?;
635706
self.emit(identifier);
@@ -666,7 +737,7 @@ where
666737
Ok(())
667738
}
668739

669-
/// Okay, we are facing a weird character, what is it? Determine that.
740+
// Dispatch based on the given character.
670741
fn consume_character(&mut self, c: char) -> Result<(), LexicalError> {
671742
match c {
672743
'0'..='9' => {
@@ -1060,12 +1131,13 @@ where
10601131
location: self.get_pos(),
10611132
});
10621133
}
1063-
} // Ignore all the rest..
1134+
}
10641135
}
10651136

10661137
Ok(())
10671138
}
10681139

1140+
// Used by single-character tokens to advance the window and emit the correct token.
10691141
fn eat_single_char(&mut self, ty: Tok) {
10701142
let tok_start = self.get_pos();
10711143
self.next_char().unwrap_or_else(|| unsafe {
@@ -1077,7 +1149,7 @@ where
10771149
self.emit((tok_start, ty, tok_end));
10781150
}
10791151

1080-
/// Helper function to go to the next character coming up.
1152+
// Helper function to go to the next character coming up.
10811153
fn next_char(&mut self) -> Option<char> {
10821154
let mut c = self.window[0];
10831155
self.window.slide();
@@ -1099,32 +1171,27 @@ where
10991171
c
11001172
}
11011173

1102-
/// Helper function to retrieve the current position.
1174+
// Helper function to retrieve the current position.
11031175
fn get_pos(&self) -> Location {
11041176
self.location
11051177
}
11061178

1107-
/// Helper function to emit a lexed token to the queue of tokens.
1179+
// Helper function to emit a lexed token to the queue of tokens.
11081180
fn emit(&mut self, spanned: Spanned) {
11091181
self.pending.push(spanned);
11101182
}
11111183
}
11121184

1113-
/* Implement iterator pattern for the get_tok function.
1114-
1115-
Calling the next element in the iterator will yield the next lexical
1116-
token.
1117-
*/
1185+
// Implement iterator pattern for Lexer.
1186+
// Calling the next element in the iterator will yield the next lexical
1187+
// token.
11181188
impl<T> Iterator for Lexer<T>
11191189
where
11201190
T: Iterator<Item = char>,
11211191
{
11221192
type Item = LexResult;
11231193

11241194
fn next(&mut self) -> Option<Self::Item> {
1125-
// Idea: create some sort of hash map for single char tokens:
1126-
// let mut X = HashMap::new();
1127-
// X.insert('=', Tok::Equal);
11281195
let token = self.inner_next();
11291196
trace!(
11301197
"Lex token {:?}, nesting={:?}, indent stack: {:?}",

0 commit comments

Comments
 (0)