Skip to content

Commit 96bfe40

Browse files
authored
Merge pull request RustPython#4492 from DimitrisJim/doc_parser_uno
Document parser crate.
2 parents 271bfcb + 5e40168 commit 96bfe40

File tree

6 files changed

+420
-77
lines changed

6 files changed

+420
-77
lines changed

compiler/parser/src/context.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use rustpython_ast::{Expr, ExprContext, ExprKind};
22

3-
pub fn set_context(expr: Expr, ctx: ExprContext) -> Expr {
3+
pub(crate) fn set_context(expr: Expr, ctx: ExprContext) -> Expr {
44
match expr.node {
55
ExprKind::Name { id, .. } => Expr {
66
node: ExprKind::Name { id, ctx },

compiler/parser/src/error.rs

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,71 @@
1-
//! Define internal parse error types
2-
//! The goal is to provide a matching and a safe error API, maksing errors from LALR
1+
//! Error types for the parser.
2+
//!
3+
//! These types are used to represent errors that occur during lexing and parsing and are
4+
//! returned by the `parse_*` functions in the [parser] module and the iterator in the
5+
//! [lexer] implementation.
6+
//!
7+
//! [parser]: crate::parser
8+
//! [lexer]: crate::lexer
39
10+
// Define internal parse error types.
11+
// The goal is to provide a matching and a safe error API, masking errors from LALR
412
use crate::{ast::Location, token::Tok};
513
use lalrpop_util::ParseError as LalrpopError;
614
use std::fmt;
715

8-
/// Represents an error during lexical scanning.
16+
/// Represents an error during lexing.
917
#[derive(Debug, PartialEq)]
1018
pub struct LexicalError {
19+
/// The type of error that occurred.
1120
pub error: LexicalErrorType,
21+
/// The location of the error.
1222
pub location: Location,
1323
}
1424

1525
impl LexicalError {
26+
/// Creates a new `LexicalError` with the given error type and location.
1627
pub fn new(error: LexicalErrorType, location: Location) -> Self {
1728
Self { error, location }
1829
}
1930
}
2031

32+
/// Represents the different types of errors that can occur during lexing.
2133
#[derive(Debug, PartialEq)]
2234
pub enum LexicalErrorType {
35+
// TODO: Can probably be removed, the places it is used seem to be able
36+
// to use the `UnicodeError` variant instead.
37+
#[doc(hidden)]
2338
StringError,
39+
// TODO: Should take a start/end position to report.
40+
/// Decoding of a unicode escape sequence in a string literal failed.
2441
UnicodeError,
42+
/// The nesting of brackets/braces/parentheses is not balanced.
2543
NestingError,
44+
/// The indentation is not consistent.
2645
IndentationError,
46+
/// Inconsistent use of tabs and spaces.
2747
TabError,
48+
/// Encountered a tab after a space.
2849
TabsAfterSpaces,
50+
/// A non-default argument follows a default argument.
2951
DefaultArgumentError,
52+
/// A duplicate argument was found in a function definition.
3053
DuplicateArgumentError(String),
54+
/// A positional argument follows a keyword argument.
3155
PositionalArgumentError,
56+
/// An iterable argument unpacking `*args` follows keyword argument unpacking `**kwargs`.
3257
UnpackedArgumentError,
58+
/// A keyword argument was repeated.
3359
DuplicateKeywordArgumentError(String),
60+
/// An unrecognized token was encountered.
3461
UnrecognizedToken { tok: char },
62+
/// An f-string error containing the [`FStringErrorType`].
3563
FStringError(FStringErrorType),
64+
/// An unexpected character was encountered after a line continuation.
3665
LineContinuationError,
66+
/// An unexpected end of file was encountered.
3767
Eof,
68+
/// An unexpected error occurred.
3869
OtherError(String),
3970
}
4071

@@ -85,13 +116,17 @@ impl fmt::Display for LexicalErrorType {
85116
}
86117

87118
// TODO: consolidate these with ParseError
119+
/// An error that occurred during parsing of an f-string.
88120
#[derive(Debug, PartialEq)]
89121
pub struct FStringError {
122+
/// The type of error that occurred.
90123
pub error: FStringErrorType,
124+
/// The location of the error.
91125
pub location: Location,
92126
}
93127

94128
impl FStringError {
129+
/// Creates a new `FStringError` with the given error type and location.
95130
pub fn new(error: FStringErrorType, location: Location) -> Self {
96131
Self { error, location }
97132
}
@@ -106,19 +141,33 @@ impl From<FStringError> for LexicalError {
106141
}
107142
}
108143

144+
/// Represents the different types of errors that can occur during parsing of an f-string.
109145
#[derive(Debug, PartialEq)]
110146
pub enum FStringErrorType {
147+
/// Expected a right brace after an opened left brace.
111148
UnclosedLbrace,
149+
/// Expected a left brace after an ending right brace.
112150
UnopenedRbrace,
151+
/// Expected a right brace after a conversion flag.
113152
ExpectedRbrace,
153+
/// An error occurred while parsing an f-string expression.
114154
InvalidExpression(Box<ParseErrorType>),
155+
/// An invalid conversion flag was encountered.
115156
InvalidConversionFlag,
157+
/// An empty expression was encountered.
116158
EmptyExpression,
159+
/// An opening delimiter was not closed properly.
117160
MismatchedDelimiter(char, char),
161+
/// Too many nested expressions in an f-string.
118162
ExpressionNestedTooDeeply,
163+
/// The f-string expression cannot include the given character.
119164
ExpressionCannotInclude(char),
165+
/// A single right brace was encountered.
120166
SingleRbrace,
167+
/// A closing delimiter was not opened properly.
121168
Unmatched(char),
169+
// TODO: Test this case.
170+
/// Unterminated string.
122171
UnterminatedString,
123172
}
124173

@@ -167,9 +216,10 @@ impl From<FStringError> for LalrpopError<Location, Tok, LexicalError> {
167216
}
168217
}
169218

170-
/// Represents an error during parsing
219+
/// Represents an error during parsing.
171220
pub type ParseError = rustpython_compiler_core::BaseError<ParseErrorType>;
172221

222+
/// Represents the different types of errors that can occur during parsing.
173223
#[derive(Debug, PartialEq, thiserror::Error)]
174224
pub enum ParseErrorType {
175225
/// Parser encountered an unexpected end of input
@@ -180,11 +230,12 @@ pub enum ParseErrorType {
180230
InvalidToken,
181231
/// Parser encountered an unexpected token
182232
UnrecognizedToken(Tok, Option<String>),
183-
/// Maps to `User` type from `lalrpop-util`
233+
// Maps to `User` type from `lalrpop-util`
234+
/// Parser encountered an error during lexing.
184235
Lexical(LexicalErrorType),
185236
}
186237

187-
/// Convert `lalrpop_util::ParseError` to our internal type
238+
// Convert `lalrpop_util::ParseError` to our internal type
188239
pub(crate) fn parse_error_from_lalrpop(
189240
err: LalrpopError<Location, Tok, LexicalError>,
190241
source_path: &str,
@@ -258,6 +309,7 @@ impl fmt::Display for ParseErrorType {
258309
}
259310

260311
impl ParseErrorType {
312+
/// Returns true if the error is an indentation error.
261313
pub fn is_indentation_error(&self) -> bool {
262314
match self {
263315
ParseErrorType::Lexical(LexicalErrorType::IndentationError) => true,
@@ -267,6 +319,8 @@ impl ParseErrorType {
267319
_ => false,
268320
}
269321
}
322+
323+
/// Returns true if the error is a tab error.
270324
pub fn is_tab_error(&self) -> bool {
271325
matches!(
272326
self,

compiler/parser/src/lib.rs

Lines changed: 109 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,119 @@
1-
//! This crate can be used to parse python sourcecode into a so
2-
//! called AST (abstract syntax tree).
1+
//! This crate can be used to parse Python source code into an Abstract
2+
//! Syntax Tree.
33
//!
4-
//! The stages involved in this process are lexical analysis and
5-
//! parsing. The lexical analysis splits the sourcecode into
6-
//! tokens, and the parsing transforms those tokens into an AST.
4+
//! ## Overview:
75
//!
8-
//! For example, one could do this:
6+
//! The process by which source code is parsed into an AST can be broken down
7+
//! into two general stages: [lexical analysis] and [parsing].
98
//!
9+
//! During lexical analysis, the source code is converted into a stream of lexical
10+
//! tokens that represent the smallest meaningful units of the language. For example,
11+
//! the source code `print("Hello world")` would _roughly_ be converted into the following
12+
//! stream of tokens:
13+
//!
14+
//! ```text
15+
//! Name("print"), LeftParen, String("Hello world"), RightParen
1016
//! ```
11-
//! use rustpython_parser::{parser, ast};
1217
//!
13-
//! let python_source = "print('Hello world')";
14-
//! let python_ast = parser::parse_expression(python_source, "<embedded>").unwrap();
18+
//! these tokens are then consumed by the parser, which matches them against a set of
19+
//! grammar rules to verify that the source code is syntactically valid and to construct
20+
//! an AST that represents the source code.
21+
//!
22+
//! During parsing, the parser consumes the tokens generated by the lexer and constructs
23+
//! a tree representation of the source code. The tree is made up of nodes that represent
24+
//! the different syntactic constructs of the language. If the source code is syntactically
25+
//! invalid, parsing fails and an error is returned. After a successful parse, the AST can
26+
//! be used to perform further analysis on the source code. Continuing with the example
27+
//! above, the AST generated by the parser would _roughly_ look something like this:
28+
//!
29+
//! ```text
30+
//! node: Expr {
31+
//! value: {
32+
//! node: Call {
33+
//! func: {
34+
//! node: Name {
35+
//! id: "print",
36+
//! ctx: Load,
37+
//! },
38+
//! },
39+
//! args: [
40+
//! node: Constant {
41+
//! value: Str("Hello World"),
42+
//! kind: None,
43+
//! },
44+
//! ],
45+
//! keywords: [],
46+
//! },
47+
//! },
48+
//! },
49+
//!```
50+
//!
51+
//! Note: The Tokens/ASTs shown above are not the exact tokens/ASTs generated by the parser.
52+
//!
53+
//! ## Source code layout:
54+
//!
55+
//! The functionality of this crate is split into several modules:
56+
//!
57+
//! - [token]: This module contains the definition of the tokens that are generated by the lexer.
58+
//! - [lexer]: This module contains the lexer and is responsible for generating the tokens.
59+
//! - [parser]: This module contains an interface to the parser and is responsible for generating the AST.
60+
//! - Functions and strings have special parsing requirements that are handled in additional files.
61+
//! - [mode]: This module contains the definition of the different modes that the parser can be in.
62+
//! - [error]: This module contains the definition of the errors that can be returned by the parser.
1563
//!
64+
//! # Examples
65+
//!
66+
//! For example, to get a stream of tokens from a given string, one could do this:
67+
//!
68+
//! ```
69+
//! use rustpython_parser::lexer::make_tokenizer;
70+
//!
71+
//! let python_source = r#"
72+
//! def is_odd(i):
73+
//! return bool(i & 1)
74+
//! "#;
75+
//! let mut tokens = make_tokenizer(python_source);
76+
//! assert!(tokens.all(|t| t.is_ok()));
1677
//! ```
78+
//!
79+
//! These tokens can be directly fed into the parser to generate an AST:
80+
//!
81+
//! ```
82+
//! use rustpython_parser::parser::{parse_tokens, Mode};
83+
//! use rustpython_parser::lexer::make_tokenizer;
84+
//!
85+
//! let python_source = r#"
86+
//! def is_odd(i):
87+
//! return bool(i & 1)
88+
//! "#;
89+
//! let tokens = make_tokenizer(python_source);
90+
//! let ast = parse_tokens(tokens, Mode::Module, "<embedded>");
91+
//!
92+
//! assert!(ast.is_ok());
93+
//! ```
94+
//!
95+
//! Alternatively, you can use one of the other `parse_*` functions to parse a string directly without using a specific
96+
//! mode or tokenizing the source beforehand:
97+
//!
98+
//! ```
99+
//! use rustpython_parser::parser::parse_program;
100+
//!
101+
//! let python_source = r#"
102+
//! def is_odd(i):
103+
//! return bool(i & 1)
104+
//! "#;
105+
//! let ast = parse_program(python_source, "<embedded>");
106+
//!
107+
//! assert!(ast.is_ok());
108+
//! ```
109+
//!
110+
//! [lexical analysis]: https://en.wikipedia.org/wiki/Lexical_analysis
111+
//! [parsing]: https://en.wikipedia.org/wiki/Parsing
112+
//! [token]: crate::token
113+
//! [lexer]: crate::lexer
114+
//! [parser]: crate::parser
115+
//! [mode]: crate::mode
116+
//! [error]: crate::error
17117
18118
#![doc(html_logo_url = "https://raw.githubusercontent.com/RustPython/RustPython/main/logo.png")]
19119
#![doc(html_root_url = "https://docs.rs/rustpython-parser/")]

compiler/parser/src/mode.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
1+
//! Controls the different modes by which a source file can be parsed.
12
use crate::token::Tok;
23

4+
/// The mode argument specifies in what way code must be parsed.
35
#[derive(Clone, Copy)]
46
pub enum Mode {
7+
/// The code consists of a sequence of statements.
58
Module,
9+
/// The code consists of a sequence of interactive statements.
610
Interactive,
11+
/// The code consists of a single expression.
712
Expression,
813
}
914

@@ -39,6 +44,7 @@ impl std::str::FromStr for Mode {
3944
}
4045
}
4146

47+
/// Returned when a given mode is not valid.
4248
#[derive(Debug)]
4349
pub struct ModeParseError {
4450
_priv: (),

0 commit comments

Comments
 (0)