Skip to content

Commit adc2325

Browse files
authored
Merge pull request RustPython#4480 from DimitrisJim/lexer_opt
Improve lexer performance by matching early on ASCII identifiers.
2 parents 5fb03b6 + 5025113 commit adc2325

File tree

1 file changed

+38
-33
lines changed

1 file changed

+38
-33
lines changed

compiler/parser/src/lexer.rs

Lines changed: 38 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ use unic_ucd_ident::{is_xid_continue, is_xid_start};
1818

1919
#[derive(Clone, Copy, PartialEq, Debug, Default)]
2020
struct IndentationLevel {
21-
tabs: usize,
22-
spaces: usize,
21+
tabs: u32,
22+
spaces: u32,
2323
}
2424

2525
impl IndentationLevel {
@@ -225,7 +225,8 @@ where
225225
at_begin_of_line: true,
226226
nesting: 0,
227227
indentations: Indentations::default(),
228-
pending: Vec::new(),
228+
// Usually we have less than 5 tokens pending.
229+
pending: Vec::with_capacity(5),
229230
location: start,
230231
window: CharWindow::new(input),
231232
};
@@ -257,13 +258,13 @@ where
257258
};
258259

259260
let start_pos = self.get_pos();
260-
let mut name = String::new();
261+
let mut name = String::with_capacity(8);
261262
while self.is_identifier_continuation() {
262263
name.push(self.next_char().unwrap());
263264
}
264265
let end_pos = self.get_pos();
265266

266-
if let Some(tok) = KEYWORDS.get(name.as_str()) {
267+
if let Some(tok) = KEYWORDS.get(&name) {
267268
Ok((start_pos, tok.clone(), end_pos))
268269
} else {
269270
Ok((start_pos, Tok::Name { name }, end_pos))
@@ -464,7 +465,7 @@ where
464465
self.next_char();
465466
}
466467
let quote_char = self.next_char().unwrap();
467-
let mut string_content = String::new();
468+
let mut string_content = String::with_capacity(5);
468469

469470
// If the next two characters are also the quote character, then we have a triple-quoted
470471
// string; consume those two characters and ensure that we require a triple-quote to close
@@ -534,12 +535,15 @@ where
534535
}
535536

536537
fn is_identifier_start(&self, c: char) -> bool {
537-
c == '_' || is_xid_start(c)
538+
match c {
539+
'a'..='z' | 'A'..='Z' | '_' => true,
540+
_ => is_xid_start(c),
541+
}
538542
}
539543

540544
fn is_identifier_continuation(&self) -> bool {
541545
match self.window[0] {
542-
Some('_' | '0'..='9') => true,
546+
Some('a'..='z' | 'A'..='Z' | '_' | '0'..='9') => true,
543547
Some(c) => is_xid_continue(c),
544548
_ => false,
545549
}
@@ -564,8 +568,8 @@ where
564568
/// Given we are at the start of a line, count the number of spaces and/or tabs until the first character.
565569
fn eat_indentation(&mut self) -> Result<IndentationLevel, LexicalError> {
566570
// Determine indentation:
567-
let mut spaces: usize = 0;
568-
let mut tabs: usize = 0;
571+
let mut spaces: u32 = 0;
572+
let mut tabs: u32 = 0;
569573
loop {
570574
match self.window[0] {
571575
Some(' ') => {
@@ -686,21 +690,9 @@ where
686690
fn consume_normal(&mut self) -> Result<(), LexicalError> {
687691
// Check if we have some character:
688692
if let Some(c) = self.window[0] {
689-
// First check identifier:
690693
if self.is_identifier_start(c) {
691694
let identifier = self.lex_identifier()?;
692695
self.emit(identifier);
693-
} else if is_emoji_presentation(c) {
694-
let tok_start = self.get_pos();
695-
self.next_char();
696-
let tok_end = self.get_pos();
697-
self.emit((
698-
tok_start,
699-
Tok::Name {
700-
name: c.to_string(),
701-
},
702-
tok_end,
703-
));
704696
} else {
705697
self.consume_character(c)?;
706698
}
@@ -1047,10 +1039,7 @@ where
10471039
}
10481040
}
10491041
',' => {
1050-
let tok_start = self.get_pos();
1051-
self.next_char();
1052-
let tok_end = self.get_pos();
1053-
self.emit((tok_start, Tok::Comma, tok_end));
1042+
self.eat_single_char(Tok::Comma);
10541043
}
10551044
'.' => {
10561045
if let Some('0'..='9') = self.window[1] {
@@ -1109,13 +1098,25 @@ where
11091098
});
11101099
}
11111100
}
1112-
11131101
_ => {
1114-
let c = self.next_char();
1115-
return Err(LexicalError {
1116-
error: LexicalErrorType::UnrecognizedToken { tok: c.unwrap() },
1117-
location: self.get_pos(),
1118-
});
1102+
if is_emoji_presentation(c) {
1103+
let tok_start = self.get_pos();
1104+
self.next_char();
1105+
let tok_end = self.get_pos();
1106+
self.emit((
1107+
tok_start,
1108+
Tok::Name {
1109+
name: c.to_string(),
1110+
},
1111+
tok_end,
1112+
));
1113+
} else {
1114+
let c = self.next_char();
1115+
return Err(LexicalError {
1116+
error: LexicalErrorType::UnrecognizedToken { tok: c.unwrap() },
1117+
location: self.get_pos(),
1118+
});
1119+
}
11191120
} // Ignore all the rest..
11201121
}
11211122

@@ -1124,7 +1125,11 @@ where
11241125

11251126
fn eat_single_char(&mut self, ty: Tok) {
11261127
let tok_start = self.get_pos();
1127-
self.next_char().unwrap();
1128+
self.next_char().unwrap_or_else(|| unsafe {
1129+
// SAFETY: eat_single_char has been called only after a character has been read
1130+
// from the window, so the window is guaranteed to be non-empty.
1131+
std::hint::unreachable_unchecked()
1132+
});
11281133
let tok_end = self.get_pos();
11291134
self.emit((tok_start, ty, tok_end));
11301135
}

0 commit comments

Comments (0)