From 6d56e4dc0d5310339ebdff44ed5885565cce72eb Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Sat, 3 May 2025 16:40:37 -0400 Subject: [PATCH 1/6] v2: add lexer --- crates/squawk_lexer/Cargo.toml | 20 + crates/squawk_lexer/README.md | 3 + crates/squawk_lexer/src/LICENSE-MIT | 25 + crates/squawk_lexer/src/cursor.rs | 65 ++ crates/squawk_lexer/src/lib.rs | 735 ++++++++++++++++++ .../snapshots/lexer__tests__bitstring.snap | 15 + .../lexer__tests__block_comment.snap | 8 + ...er__tests__block_comment_unterminated.snap | 8 + ...s__dollar_quote_mismatch_tags_complex.snap | 10 + ...ts__dollar_quote_mismatch_tags_simple.snap | 10 + .../lexer__tests__dollar_quoting.snap | 14 + .../lexer__tests__lex_statement.snap | 10 + .../snapshots/lexer__tests__line_comment.snap | 9 + ...lexer__tests__line_comment_whitespace.snap | 15 + .../src/snapshots/lexer__tests__numeric.snap | 27 + .../lexer__tests__numeric_non_decimal.snap | 19 + ...lexer__tests__numeric_with_seperators.snap | 17 + .../src/snapshots/lexer__tests__params.snap | 26 + .../snapshots/lexer__tests__quoted_ident.snap | 11 + .../lexer__tests__select_with_period.snap | 14 + .../src/snapshots/lexer__tests__string.snap | 24 + .../lexer__tests__string_unicode_escape.snap | 19 + .../lexer__tests__string_with_escapes.snap | 19 + crates/squawk_lexer/src/token.rs | 155 ++++ 24 files changed, 1278 insertions(+) create mode 100644 crates/squawk_lexer/Cargo.toml create mode 100644 crates/squawk_lexer/README.md create mode 100644 crates/squawk_lexer/src/LICENSE-MIT create mode 100644 crates/squawk_lexer/src/cursor.rs create mode 100644 crates/squawk_lexer/src/lib.rs create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__bitstring.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__block_comment.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__block_comment_unterminated.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_complex.snap 
create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_simple.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quoting.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__lex_statement.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__line_comment.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__line_comment_whitespace.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__numeric.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__numeric_non_decimal.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__numeric_with_seperators.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__params.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__quoted_ident.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__select_with_period.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__string.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__string_unicode_escape.snap create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__string_with_escapes.snap create mode 100644 crates/squawk_lexer/src/token.rs diff --git a/crates/squawk_lexer/Cargo.toml b/crates/squawk_lexer/Cargo.toml new file mode 100644 index 00000000..11c75156 --- /dev/null +++ b/crates/squawk_lexer/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "lexer" +version = "0.0.0" +description = "TBD" + +authors.workspace = true +edition.workspace = true +license.workspace = true +rust-version.workspace = true + +[lib] +doctest = false + +[dependencies] + +[dev-dependencies] +insta.workspace = true + +[lints] +workspace = true diff --git a/crates/squawk_lexer/README.md b/crates/squawk_lexer/README.md new file mode 100644 index 00000000..59a462a7 --- /dev/null +++ b/crates/squawk_lexer/README.md @@ -0,0 +1,3 @@ +# lexer + +> Adapted 
from the Rust lexer. diff --git a/crates/squawk_lexer/src/LICENSE-MIT b/crates/squawk_lexer/src/LICENSE-MIT new file mode 100644 index 00000000..163de68c --- /dev/null +++ b/crates/squawk_lexer/src/LICENSE-MIT @@ -0,0 +1,25 @@ +from: https://github.com/rust-lang/rust/blob/176e5452095444815207be02c16de0b1487a1b53/LICENSE-MIT + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/crates/squawk_lexer/src/cursor.rs b/crates/squawk_lexer/src/cursor.rs new file mode 100644 index 00000000..dad5d9e5 --- /dev/null +++ b/crates/squawk_lexer/src/cursor.rs @@ -0,0 +1,65 @@ +use std::str::Chars; + +/// Peekable iterator over a char sequence. +/// +/// Next characters can be peeked via `first` method, +/// and position can be shifted forward via `bump` method. 
+/// based on: +/// - https://github.com/rust-lang/rust/blob/d1b7355d3d7b4ead564dbecb1d240fcc74fff21b/compiler/rustc_lexer/src/cursor.rs +/// - https://github.com/astral-sh/ruff/blob/d1079680bb29f6b797b5df15327195300f635f3c/crates/ruff_python_parser/src/lexer/cursor.rs +/// +pub(crate) struct Cursor<'a> { + /// Iterator over chars. Slightly faster than a &str. + chars: Chars<'a>, + len_remaining: usize, +} + +pub(crate) const EOF_CHAR: char = '\0'; + +impl<'a> Cursor<'a> { + pub(crate) fn new(input: &'a str) -> Cursor<'a> { + Cursor { + len_remaining: input.len(), + chars: input.chars(), + } + } + + /// Peeks the next symbol from the input stream without consuming it. + /// If requested position doesn't exist, `EOF_CHAR` is returned. + /// However, getting `EOF_CHAR` doesn't always mean actual end of file, + /// it should be checked with `is_eof` method. + pub(crate) fn first(&self) -> char { + // `.next()` optimizes better than `.nth(0)` + self.chars.clone().next().unwrap_or(EOF_CHAR) + } + + /// Checks if there is nothing more to consume. + pub(crate) fn is_eof(&self) -> bool { + self.chars.as_str().is_empty() + } + + /// Returns amount of already consumed symbols. + pub(crate) fn pos_within_token(&self) -> u32 { + (self.len_remaining - self.chars.as_str().len()) as u32 + } + + /// Resets the number of bytes consumed to 0. + pub(crate) fn reset_pos_within_token(&mut self) { + self.len_remaining = self.chars.as_str().len(); + } + + /// Moves to the next character. + pub(crate) fn bump(&mut self) -> Option { + let c = self.chars.next()?; + Some(c) + } + + /// Eats symbols while predicate returns true or until the end of file is reached. + pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + // It was tried making optimized version of this for eg. line comments, but + // LLVM can inline all of this and compile it down to fast iteration over bytes. 
// via: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L346
// ident_start [A-Za-z\200-\377_]
/// Returns `true` if `c` may begin an unquoted SQL identifier.
///
/// PostgreSQL's `ident_start` class is defined over *bytes*: `\200-\377`
/// admits every byte with the high bit set, which in a multi-byte encoding
/// means every byte of every non-ASCII character. This lexer works on decoded
/// `char`s, so the equivalent class is "any non-ASCII scalar value", not just
/// `U+0080..=U+00FF`.
const fn is_ident_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '\u{80}'..='\u{10FFFF}')
}

// ident_cont [A-Za-z\200-\377_0-9\$]
/// Returns `true` if `c` may continue an unquoted SQL identifier.
///
/// Accepts everything [`is_ident_start`] accepts, plus ASCII digits and `$`.
const fn is_ident_cont(c: char) -> bool {
    matches!(
        c,
        'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '$' | '\u{80}'..='\u{10FFFF}'
    )
}
+ c if is_whitespace(c) => self.whitespace(), + + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE + 'u' | 'U' => match self.first() { + '&' => { + self.bump(); + self.prefixed_string( + |terminated| LiteralKind::UnicodeEscStr { terminated }, + true, + ) + } + _ => self.ident_or_unknown_prefix(), + }, + + // escaped strings + 'e' | 'E' => { + self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false) + } + + // bit string + 'b' | 'B' => { + self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false) + } + + // hexadecimal byte string + 'x' | 'X' => { + self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false) + } + + // Identifier (this should be checked after other variant that can + // start as identifier). + c if is_ident_start(c) => self.ident(), + + // Numeric literal. + // see: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC + c @ '0'..='9' => { + let literal_kind = self.number(c); + TokenKind::Literal { kind: literal_kind } + } + '.' => match self.first() { + '0'..='9' => { + let literal_kind = self.number('.'); + TokenKind::Literal { kind: literal_kind } + } + _ => TokenKind::Dot, + }, + // One-symbol tokens. + ';' => TokenKind::Semi, + ',' => TokenKind::Comma, + '(' => TokenKind::OpenParen, + ')' => TokenKind::CloseParen, + '[' => TokenKind::OpenBracket, + ']' => TokenKind::CloseBracket, + '@' => TokenKind::At, + '#' => TokenKind::Pound, + '~' => TokenKind::Tilde, + '?' => TokenKind::Question, + ':' => TokenKind::Colon, + '$' => { + // Dollar quoted strings + if is_ident_start(self.first()) || self.first() == '$' { + self.dollar_quoted_string() + } else { + // Parameters + while self.first().is_ascii_digit() { + self.bump(); + } + TokenKind::Param + } + } + '`' => TokenKind::Backtick, + '=' => TokenKind::Eq, + '!' 
=> TokenKind::Bang, + '<' => TokenKind::Lt, + '>' => TokenKind::Gt, + '&' => TokenKind::And, + '|' => TokenKind::Or, + '+' => TokenKind::Plus, + '*' => TokenKind::Star, + '^' => TokenKind::Caret, + '%' => TokenKind::Percent, + + // String literal + '\'' => { + let terminated = self.single_quoted_string(); + let kind = LiteralKind::Str { terminated }; + TokenKind::Literal { kind } + } + + // Quoted indentifiers + '"' => { + let terminated = self.double_quoted_string(); + TokenKind::QuotedIdent { terminated } + } + _ => TokenKind::Unknown, + }; + let res = Token::new(token_kind, self.pos_within_token()); + self.reset_pos_within_token(); + res + } + pub(crate) fn ident(&mut self) -> TokenKind { + self.eat_while(is_ident_cont); + TokenKind::Ident + } + + pub(crate) fn whitespace(&mut self) -> TokenKind { + self.eat_while(is_whitespace); + TokenKind::Whitespace + } + + fn ident_or_unknown_prefix(&mut self) -> TokenKind { + // Start is already eaten, eat the rest of identifier. + self.eat_while(is_ident_cont); + // Known prefixes must have been handled earlier. So if + // we see a prefix here, it is definitely an unknown prefix. 
+ match self.first() { + '#' | '"' | '\'' => TokenKind::UnknownPrefix, + _ => TokenKind::Ident, + } + } + + // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L227 + // comment ("--"{non_newline}*) + pub(crate) fn line_comment(&mut self) -> TokenKind { + self.bump(); + + self.eat_while(|c| c != '\n'); + TokenKind::LineComment + } + + // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L324-L344 + pub(crate) fn block_comment(&mut self) -> TokenKind { + self.bump(); + + let mut depth = 1usize; + while let Some(c) = self.bump() { + match c { + '/' if self.first() == '*' => { + self.bump(); + depth += 1; + } + '*' if self.first() == '/' => { + self.bump(); + depth -= 1; + if depth == 0 { + // This block comment is closed, so for a construction like "/* */ */" + // there will be a successfully parsed block comment "/* */" + // and " */" will be processed separately. + break; + } + } + _ => (), + } + } + + TokenKind::BlockComment { + terminated: depth == 0, + } + } + + fn prefixed_string( + &mut self, + mk_kind: fn(bool) -> LiteralKind, + allows_double: bool, + ) -> TokenKind { + match self.first() { + '\'' => { + self.bump(); + let terminated = self.single_quoted_string(); + let kind = mk_kind(terminated); + TokenKind::Literal { kind } + } + '"' if allows_double => { + self.bump(); + let terminated = self.double_quoted_string(); + let kind = mk_kind(terminated); + TokenKind::Literal { kind } + } + _ => self.ident_or_unknown_prefix(), + } + } + + fn number(&mut self, first_digit: char) -> LiteralKind { + let mut base = Base::Decimal; + if first_digit == '0' { + // Attempt to parse encoding base. 
+ match self.first() { + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L403 + 'b' | 'B' => { + base = Base::Binary; + self.bump(); + if !self.eat_decimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L402 + 'o' | 'O' => { + base = Base::Octal; + self.bump(); + if !self.eat_decimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L401 + 'x' | 'X' => { + base = Base::Hexadecimal; + self.bump(); + if !self.eat_hexadecimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // Not a base prefix; consume additional digits. + '0'..='9' | '_' => { + self.eat_decimal_digits(); + } + + // Also not a base prefix; nothing more to do here. + '.' | 'e' | 'E' => {} + + // Just a 0. + _ => { + return LiteralKind::Int { + base, + empty_int: false, + } + } + } + } else { + // No base prefix, parse number in the usual way. + self.eat_decimal_digits(); + }; + + match self.first() { + '.' => { + // might have stuff after the ., and if it does, it needs to start + // with a number + self.bump(); + let mut empty_exponent = false; + if self.first().is_ascii_digit() { + self.eat_decimal_digits(); + match self.first() { + 'e' | 'E' => { + self.bump(); + empty_exponent = !self.eat_float_exponent(); + } + _ => (), + } + } + LiteralKind::Float { + base, + empty_exponent, + } + } + 'e' | 'E' => { + self.bump(); + let empty_exponent = !self.eat_float_exponent(); + LiteralKind::Float { + base, + empty_exponent, + } + } + _ => LiteralKind::Int { + base, + empty_int: false, + }, + } + } + + fn single_quoted_string(&mut self) -> bool { + // Parse until either quotes are terminated or error is detected. 
+ loop { + match self.first() { + // Quotes might be terminated. + '\'' => { + self.bump(); + + match self.first() { + // encountered an escaped quote '' + '\'' => { + self.bump(); + } + // encountered terminating quote + _ => return true, + } + } + // End of file, stop parsing. + EOF_CHAR if self.is_eof() => break, + // Skip the character. + _ => { + self.bump(); + } + } + } + // String was not terminated. + false + } + + /// Eats double-quoted string and returns true + /// if string is terminated. + fn double_quoted_string(&mut self) -> bool { + while let Some(c) = self.bump() { + match c { + '"' => { + return true; + } + '\\' if self.first() == '\\' || self.first() == '"' => { + // Bump again to skip escaped character. + self.bump(); + } + _ => (), + } + } + // End of file reached. + false + } + + // https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING + fn dollar_quoted_string(&mut self) -> TokenKind { + // Get the start sequence of the dollar quote, i.e., 'foo' in + // $foo$hello$foo$ + let mut start = vec![]; + while let Some(c) = self.bump() { + match c { + '$' => { + self.bump(); + break; + } + _ => { + start.push(c); + } + } + } + + if start.is_empty() { + loop { + self.eat_while(|c| c != '$'); + if self.is_eof() { + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated: false }, + }; + } + // eat $ + self.bump(); + if self.first() == '$' { + self.bump(); + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated: true }, + }; + } + } + } else { + loop { + self.eat_while(|c| c != start[0]); + if self.is_eof() { + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated: false }, + }; + } + + // might be the start of our start/end sequence + let mut match_count = 0; + for start_char in start.iter() { + if self.first() == *start_char { + self.bump(); + match_count += 1; + } else { + self.bump(); + break; + } + } + + // closing '$' + if self.first() == 
'$' { + self.bump(); + let terminated = match_count == start.len(); + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated }, + }; + } + } + } + } + + fn eat_decimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.first() { + '_' => { + self.bump(); + } + '0'..='9' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits + } + + fn eat_hexadecimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.first() { + '_' => { + self.bump(); + } + '0'..='9' | 'a'..='f' | 'A'..='F' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits + } + + /// Eats the float exponent. Returns true if at least one digit was met, + /// and returns false otherwise. + fn eat_float_exponent(&mut self) -> bool { + if self.first() == '-' || self.first() == '+' { + self.bump(); + } + self.eat_decimal_digits() + } +} + +/// Creates an iterator that produces tokens from the input string. 
+pub fn tokenize(input: &str) -> impl Iterator + '_ { + let mut cursor = Cursor::new(input); + std::iter::from_fn(move || { + let token = cursor.advance_token(); + if token.kind != TokenKind::Eof { + Some(token) + } else { + None + } + }) +} + +#[cfg(test)] +mod tests { + use std::fmt; + + use super::*; + use insta::assert_debug_snapshot; + + struct TokenDebug<'a> { + content: &'a str, + token: Token, + } + impl fmt::Debug for TokenDebug<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?} @ {:?}", self.content, self.token.kind) + } + } + + impl<'a> TokenDebug<'a> { + fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> { + TokenDebug { + token, + content: &input[start as usize..(start + token.len) as usize], + } + } + } + + fn lex(input: &str) -> Vec { + let mut tokens = vec![]; + let mut start = 0; + + for token in tokenize(input) { + let length = token.len; + tokens.push(TokenDebug::new(token, input, start)); + start += length; + } + tokens + } + #[test] + fn lex_statement() { + let result = lex("select 1;"); + assert_debug_snapshot!(result); + } + + #[test] + fn block_comment() { + let result = lex(r#" +/* + * foo + * bar +*/"#); + assert_debug_snapshot!(result); + } + + #[test] + fn block_comment_unterminated() { + let result = lex(r#" +/* + * foo + * bar + /* +*/"#); + assert_debug_snapshot!(result); + } + + #[test] + fn line_comment() { + let result = lex(r#" +-- foooooooooooo bar buzz +"#); + assert_debug_snapshot!(result); + } + + #[test] + fn line_comment_whitespace() { + assert_debug_snapshot!(lex(r#" +select 'Hello' -- This is a comment +' World';"#)) + } + + #[test] + fn dollar_quoting() { + assert_debug_snapshot!(lex(r#" +$$Dianne's horse$$ +$SomeTag$Dianne's horse$SomeTag$ + +-- with dollar inside and matching tags +$foo$hello$world$bar$ +"#)) + } + + #[test] + fn dollar_quote_mismatch_tags_simple() { + assert_debug_snapshot!(lex(r#" +-- dollar quoting with mismatched tags +$foo$hello world$bar$ +"#)); + 
} + + #[test] + fn dollar_quote_mismatch_tags_complex() { + assert_debug_snapshot!(lex(r#" +-- with dollar inside but mismatched tags +$foo$hello$world$bar$ +"#)); + } + + #[test] + fn numeric() { + assert_debug_snapshot!(lex(r#" +42 +3.5 +4. +.001 +.123e10 +5e2 +1.925e-3 +1e-10 +1e+10 +1e10 +"#)) + } + + #[test] + fn numeric_non_decimal() { + assert_debug_snapshot!(lex(r#" +0b100101 +0B10011001 +0o273 +0O755 +0x42f +0XFFFF +"#)) + } + + #[test] + fn numeric_with_seperators() { + assert_debug_snapshot!(lex(r#" +1_500_000_000 +0b10001000_00000000 +0o_1_755 +0xFFFF_FFFF +1.618_034 +"#)) + } + + #[test] + fn select_with_period() { + assert_debug_snapshot!(lex(r#" +select public.users; +"#)) + } + + #[test] + fn bitstring() { + assert_debug_snapshot!(lex(r#" +B'1001' +b'1001' +X'1FF' +x'1FF' +"#)) + } + + #[test] + fn string() { + assert_debug_snapshot!(lex(r#" +'Dianne''s horse' + +select 'foo '' +bar'; + +select 'foooo' + 'bar'; + + +'foo \\ \n \tbar' + +'forgot to close the string +"#)) + } + + #[test] + fn params() { + assert_debug_snapshot!(lex(r#" +select $1 + $2; + +select $1123123123123; + +select $; +"#)) + } + + #[test] + fn string_with_escapes() { + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-ESCAPE + + assert_debug_snapshot!(lex(r#" +E'foo' + +e'bar' + +e'\b\f\n\r\t' + +e'\0\11\777' + +e'\x0\x11\xFF' + +e'\uAAAA \UFFFFFFFF' + +"#)) + } + + #[test] + fn string_unicode_escape() { + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE + + assert_debug_snapshot!(lex(r#" +U&"d\0061t\+000061" + +U&"\0441\043B\043E\043D" + +u&'\0441\043B' + +U&"d!0061t!+000061" UESCAPE '!' 
+"#)) + } + + #[test] + fn quoted_ident() { + assert_debug_snapshot!(lex(r#" +"hello &1 -world"; + + +"hello-world +"#)) + } +} diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__bitstring.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__bitstring.snap new file mode 100644 index 00000000..9399cff9 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__bitstring.snap @@ -0,0 +1,15 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\nB'1001'\nb'1001'\nX'1FF'\nx'1FF'\n\"#)" +--- +[ + "\n" @ Whitespace, + "B'1001'" @ Literal { kind: BitStr { terminated: true } }, + "\n" @ Whitespace, + "b'1001'" @ Literal { kind: BitStr { terminated: true } }, + "\n" @ Whitespace, + "X'1FF'" @ Literal { kind: ByteStr { terminated: true } }, + "\n" @ Whitespace, + "x'1FF'" @ Literal { kind: ByteStr { terminated: true } }, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment.snap new file mode 100644 index 00000000..f7bfe460 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment.snap @@ -0,0 +1,8 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: result +--- +[ + "\n" @ Whitespace, + "/*\n * foo\n * bar\n*/" @ BlockComment { terminated: true }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment_unterminated.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment_unterminated.snap new file mode 100644 index 00000000..2acb3e33 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment_unterminated.snap @@ -0,0 +1,8 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: result +--- +[ + "\n" @ Whitespace, + "/*\n * foo\n * bar\n /*\n*/" @ BlockComment { terminated: false }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_complex.snap 
b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_complex.snap new file mode 100644 index 00000000..914f4770 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_complex.snap @@ -0,0 +1,10 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n-- with dollar inside but mismatched tags\n$foo$hello$world$bar$\n\"#)" +--- +[ + "\n" @ Whitespace, + "-- with dollar inside but mismatched tags" @ LineComment, + "\n" @ Whitespace, + "$foo$hello$world$bar$\n" @ Literal { kind: DollarQuotedString { terminated: false } }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_simple.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_simple.snap new file mode 100644 index 00000000..1035dd00 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_simple.snap @@ -0,0 +1,10 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n-- dollar quoting with mismatched tags\n$foo$hello world$bar$\n\"#)" +--- +[ + "\n" @ Whitespace, + "-- dollar quoting with mismatched tags" @ LineComment, + "\n" @ Whitespace, + "$foo$hello world$bar$\n" @ Literal { kind: DollarQuotedString { terminated: false } }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quoting.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quoting.snap new file mode 100644 index 00000000..b1d86251 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quoting.snap @@ -0,0 +1,14 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n$$Dianne's horse$$\n$SomeTag$Dianne's horse$SomeTag$\n\n-- with dollar inside and matching tags\n$foo$hello$world$bar$\n\"#)" +--- +[ + "\n" @ Whitespace, + "$$Dianne's horse$$" @ Literal { kind: DollarQuotedString { terminated: true } }, + "\n" @ Whitespace, + "$SomeTag$Dianne's horse$SomeTag$" @ Literal { kind: 
DollarQuotedString { terminated: true } }, + "\n\n" @ Whitespace, + "-- with dollar inside and matching tags" @ LineComment, + "\n" @ Whitespace, + "$foo$hello$world$bar$\n" @ Literal { kind: DollarQuotedString { terminated: false } }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__lex_statement.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__lex_statement.snap new file mode 100644 index 00000000..a76c42ba --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__lex_statement.snap @@ -0,0 +1,10 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: result +--- +[ + "select" @ Ident, + " " @ Whitespace, + "1" @ Literal { kind: Int { base: Decimal, empty_int: false } }, + ";" @ Semi, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment.snap new file mode 100644 index 00000000..e58ef5ff --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment.snap @@ -0,0 +1,9 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: result +--- +[ + "\n" @ Whitespace, + "-- foooooooooooo bar buzz" @ LineComment, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment_whitespace.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment_whitespace.snap new file mode 100644 index 00000000..fef03086 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment_whitespace.snap @@ -0,0 +1,15 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\nselect 'Hello' -- This is a comment\n' World';\"#)" +--- +[ + "\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + "'Hello'" @ Literal { kind: Str { terminated: true } }, + " " @ Whitespace, + "-- This is a comment" @ LineComment, + "\n" @ Whitespace, + "' World'" @ Literal { kind: Str { terminated: true } }, + ";" @ Semi, +] diff --git 
a/crates/squawk_lexer/src/snapshots/lexer__tests__numeric.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric.snap new file mode 100644 index 00000000..c831bd25 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric.snap @@ -0,0 +1,27 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n42\n3.5\n4.\n.001\n.123e10\n5e2\n1.925e-3\n1e-10\n1e+10\n1e10\n\"#)" +--- +[ + "\n" @ Whitespace, + "42" @ Literal { kind: Int { base: Decimal, empty_int: false } }, + "\n" @ Whitespace, + "3.5" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + "4." @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + ".001" @ Literal { kind: Int { base: Decimal, empty_int: false } }, + "\n" @ Whitespace, + ".123e10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + "5e2" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + "1.925e-3" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + "1e-10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + "1e+10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + "1e10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_non_decimal.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_non_decimal.snap new file mode 100644 index 00000000..5050265f --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_non_decimal.snap @@ -0,0 +1,19 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n0b100101\n0B10011001\n0o273\n0O755\n0x42f\n0XFFFF\n\"#)" +--- +[ + "\n" @ Whitespace, + "0b100101" @ Literal { kind: Int { base: Binary, empty_int: false } }, + "\n" @ Whitespace, + "0B10011001" @ Literal 
{ kind: Int { base: Binary, empty_int: false } }, + "\n" @ Whitespace, + "0o273" @ Literal { kind: Int { base: Octal, empty_int: false } }, + "\n" @ Whitespace, + "0O755" @ Literal { kind: Int { base: Octal, empty_int: false } }, + "\n" @ Whitespace, + "0x42f" @ Literal { kind: Int { base: Hexadecimal, empty_int: false } }, + "\n" @ Whitespace, + "0XFFFF" @ Literal { kind: Int { base: Hexadecimal, empty_int: false } }, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_with_seperators.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_with_seperators.snap new file mode 100644 index 00000000..46814c5f --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_with_seperators.snap @@ -0,0 +1,17 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n1_500_000_000\n0b10001000_00000000\n0o_1_755\n0xFFFF_FFFF\n1.618_034\n\"#)" +--- +[ + "\n" @ Whitespace, + "1_500_000_000" @ Literal { kind: Int { base: Decimal, empty_int: false } }, + "\n" @ Whitespace, + "0b10001000_00000000" @ Literal { kind: Int { base: Binary, empty_int: false } }, + "\n" @ Whitespace, + "0o_1_755" @ Literal { kind: Int { base: Octal, empty_int: false } }, + "\n" @ Whitespace, + "0xFFFF_FFFF" @ Literal { kind: Int { base: Hexadecimal, empty_int: false } }, + "\n" @ Whitespace, + "1.618_034" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__params.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__params.snap new file mode 100644 index 00000000..1879b452 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__params.snap @@ -0,0 +1,26 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\nselect $1 + $2;\n\nselect $1123123123123;\n\nselect $;\n\"#)" +--- +[ + "\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + "$1" @ Param, + " " @ Whitespace, + "+" @ Plus, + " " @ 
Whitespace, + "$2" @ Param, + ";" @ Semi, + "\n\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + "$1123123123123" @ Param, + ";" @ Semi, + "\n\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + "$" @ Param, + ";" @ Semi, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__quoted_ident.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__quoted_ident.snap new file mode 100644 index 00000000..70f71342 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__quoted_ident.snap @@ -0,0 +1,11 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n\"hello &1 -world\";\n\n\n\"hello-world\n\"#)" +--- +[ + "\n" @ Whitespace, + "\"hello &1 -world\"" @ QuotedIdent { terminated: true }, + ";" @ Semi, + "\n\n\n" @ Whitespace, + "\"hello-world\n" @ QuotedIdent { terminated: false }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__select_with_period.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__select_with_period.snap new file mode 100644 index 00000000..00ef920f --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__select_with_period.snap @@ -0,0 +1,14 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\nselect public.users;\n\"#)" +--- +[ + "\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + "public" @ Ident, + "." 
@ Dot, + "users" @ Ident, + ";" @ Semi, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__string.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__string.snap new file mode 100644 index 00000000..1022c823 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__string.snap @@ -0,0 +1,24 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n'Dianne''s horse'\n\nselect 'foo ''\nbar';\n\nselect 'foooo' \n 'bar';\n\n\n'foo \\\\ \\n \\tbar'\n\n'forgot to close the string\n\"#)" +--- +[ + "\n" @ Whitespace, + "'Dianne''s horse'" @ Literal { kind: Str { terminated: true } }, + "\n\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + "'foo ''\nbar'" @ Literal { kind: Str { terminated: true } }, + ";" @ Semi, + "\n\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + "'foooo'" @ Literal { kind: Str { terminated: true } }, + " \n " @ Whitespace, + "'bar'" @ Literal { kind: Str { terminated: true } }, + ";" @ Semi, + "\n\n\n" @ Whitespace, + "'foo \\\\ \\n \\tbar'" @ Literal { kind: Str { terminated: true } }, + "\n\n" @ Whitespace, + "'forgot to close the string\n" @ Literal { kind: Str { terminated: false } }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__string_unicode_escape.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__string_unicode_escape.snap new file mode 100644 index 00000000..b257b050 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__string_unicode_escape.snap @@ -0,0 +1,19 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\nU&\"d\\0061t\\+000061\"\n\nU&\"\\0441\\043B\\043E\\043D\"\n\nu&'\\0441\\043B'\n\nU&\"d!0061t!+000061\" UESCAPE '!'\n\"#)" +--- +[ + "\n" @ Whitespace, + "U&\"d\\0061t\\+000061\"" @ Literal { kind: UnicodeEscStr { terminated: true } }, + "\n\n" @ Whitespace, + "U&\"\\0441\\043B\\043E\\043D\"" @ Literal { kind: UnicodeEscStr { terminated: true } }, + "\n\n" @ Whitespace, + "u&'\\0441\\043B'" @ Literal { 
kind: UnicodeEscStr { terminated: true } }, + "\n\n" @ Whitespace, + "U&\"d!0061t!+000061\"" @ Literal { kind: UnicodeEscStr { terminated: true } }, + " " @ Whitespace, + "UESCAPE" @ Ident, + " " @ Whitespace, + "'!'" @ Literal { kind: Str { terminated: true } }, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__string_with_escapes.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__string_with_escapes.snap new file mode 100644 index 00000000..67947733 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__string_with_escapes.snap @@ -0,0 +1,19 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\nE'foo'\n\ne'bar'\n\ne'\\b\\f\\n\\r\\t'\n\ne'\\0\\11\\777'\n\ne'\\x0\\x11\\xFF'\n\ne'\\uAAAA \\UFFFFFFFF'\n\n\"#)" +--- +[ + "\n" @ Whitespace, + "E'foo'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ Whitespace, + "e'bar'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ Whitespace, + "e'\\b\\f\\n\\r\\t'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ Whitespace, + "e'\\0\\11\\777'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ Whitespace, + "e'\\x0\\x11\\xFF'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ Whitespace, + "e'\\uAAAA \\UFFFFFFFF'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/token.rs b/crates/squawk_lexer/src/token.rs new file mode 100644 index 00000000..9853f8cc --- /dev/null +++ b/crates/squawk_lexer/src/token.rs @@ -0,0 +1,155 @@ +// based on: https://github.com/rust-lang/rust/blob/d1b7355d3d7b4ead564dbecb1d240fcc74fff21b/compiler/rustc_lexer/src/lib.rs#L58 +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum TokenKind { + /// Used when there's an error of some sort while lexing. + Unknown, + /// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid + /// suffix, but may be present here on string and float literals. 
Users of + /// this type will need to check for and reject that case. + /// + /// See [`LiteralKind`] for more details. + Literal { kind: LiteralKind }, + /// Space, tab, newline, carriage return, vertical tab, form feed + Whitespace, + /// Identifier + /// + /// case-sensitive + Ident, + /// `;` + Semi, + /// End of file + Eof, + /// `/` + Slash, + /// `-- foo` + LineComment, + /// ``` + /// /* + /// foo + /// */ + /// ``` + BlockComment { terminated: bool }, + /// `-` + Minus, + /// `:` + Colon, + /// `.` + Dot, + /// `=` + Eq, + /// `>` + Gt, + /// `&` + And, + /// `<` + Lt, + /// `!` + Bang, + /// `+` + Plus, + /// `~` + Tilde, + /// `#` + Pound, + /// `?` + Question, + /// `|` + Or, + /// `%` + Percent, + /// `^` + Caret, + /// `*` + Star, + /// `` ` `` + Backtick, + /// `@` + At, + /// `]` + CloseBracket, + /// `[` + OpenBracket, + /// `)` + CloseParen, + /// `(` + OpenParen, + /// `,` + Comma, + /// Error case that we need to report later on. + UnknownPrefix, + /// Positional Parameter, e.g., `$1` + /// + /// see: https://www.postgresql.org/docs/16/sql-expressions.html#SQL-EXPRESSIONS-PARAMETERS-POSITIONAL + Param, + /// Quoted Identifier, e.g., `"update"` in `update "my_table" set "a" = 5;` + /// + /// These are case-sensitive, unlike [`TokenKind::Ident`] + /// + /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS + QuotedIdent { terminated: bool }, +} + +/// Parsed token. +/// It doesn't contain information about data that has been parsed, +/// only the type of the token and its size. +#[derive(Debug, Clone, Copy)] +pub struct Token { + pub kind: TokenKind, + pub len: u32, +} + +impl Token { + pub(crate) fn new(kind: TokenKind, len: u32) -> Token { + Token { kind, len } + } +} + +/// Base of numeric literal encoding according to its prefix. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum Base { + /// Literal starts with "0b". + Binary = 2, + /// Literal starts with "0o". 
+ Octal = 8, + /// Literal doesn't contain a prefix. + Decimal = 10, + /// Literal starts with "0x". + Hexadecimal = 16, +} + +// Enum representing the literal types supported by the lexer. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum LiteralKind { + /// Integer Numeric, e.g., `42` + /// + /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC + Int { base: Base, empty_int: bool }, + /// Float Numeric, e.g., `1.925e-3` + /// + /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC + Float { base: Base, empty_exponent: bool }, + /// String, e.g., `'foo'` + /// + /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS + Str { terminated: bool }, + /// Hexidecimal Bit String, e.g., `X'1FF'` + /// + /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-BIT-STRINGS + ByteStr { terminated: bool }, + /// Bit String, e.g., `B'1001'` + /// + /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-BIT-STRINGS + BitStr { terminated: bool }, + /// Dollar Quoted String, e.g., `$$Dianne's horse$$` + /// + /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING + DollarQuotedString { terminated: bool }, + /// Unicode Escape String, e.g., `U&'d\0061t\+000061'` + /// + /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE + UnicodeEscStr { terminated: bool }, + /// Escape String, e.g, `E'foo'` + /// + /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html + EscStr { terminated: bool }, +} From 1e1efb7827f7895e51e6ffcf2f90f4aa4c3b726d Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Sat, 3 May 2025 16:45:20 -0400 Subject: [PATCH 2/6] fix --- Cargo.toml | 4 ++++ crates/linter/Cargo.toml | 6 +++--- crates/squawk_lexer/Cargo.toml | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git 
a/Cargo.toml b/Cargo.toml index bfa13eb7..2e90fad1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,9 @@ [workspace] members = ["crates/*"] +authors = ["Squawk Team & Contributors"] +edition = "2021" +license = "GPL-3.0" +rust-version = "1.81.0" [workspace.dependencies] # third party diff --git a/crates/linter/Cargo.toml b/crates/linter/Cargo.toml index 0d6eb7c6..a0cbde71 100644 --- a/crates/linter/Cargo.toml +++ b/crates/linter/Cargo.toml @@ -1,9 +1,9 @@ [package] name = "squawk-linter" version = "0.0.0" -authors = ["Steve Dignam "] -edition = "2018" -license = "GPL-3.0" +authors.workspace = true +edition.workspace = true +license.workspace = true description = "Postgres SQL linter used in squawk" repository = "https://github.com/sbdchd/squawk" readme = "README.md" diff --git a/crates/squawk_lexer/Cargo.toml b/crates/squawk_lexer/Cargo.toml index 11c75156..699c9077 100644 --- a/crates/squawk_lexer/Cargo.toml +++ b/crates/squawk_lexer/Cargo.toml @@ -5,7 +5,7 @@ description = "TBD" authors.workspace = true edition.workspace = true -license.workspace = true +license = "MIT" rust-version.workspace = true [lib] From c4233a8acd1aaa4bb556bb601fe9dadb54f7ce83 Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Sat, 3 May 2025 16:50:06 -0400 Subject: [PATCH 3/6] fix --- Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 2e90fad1..e44a1c43 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,7 @@ [workspace] members = ["crates/*"] + +[workspace.package] authors = ["Squawk Team & Contributors"] edition = "2021" license = "GPL-3.0" From a3c47078462153c56d37614858cfa1ecfb5ae3b9 Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Sat, 3 May 2025 16:53:01 -0400 Subject: [PATCH 4/6] fix --- Cargo.lock | 7 +++++++ Cargo.toml | 13 +++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b2c00ea6..69e41799 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -749,6 +749,13 @@ version = "1.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "lexer" +version = "0.0.0" +dependencies = [ + "insta", +] + [[package]] name = "libc" version = "0.2.167" diff --git a/Cargo.toml b/Cargo.toml index e44a1c43..31700045 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,11 +1,12 @@ [workspace] members = ["crates/*"] +resolver = "2" [workspace.package] -authors = ["Squawk Team & Contributors"] edition = "2021" -license = "GPL-3.0" rust-version = "1.81.0" +authors = ["Squawk Team & Contributors"] +license = "GPL-3.0" [workspace.dependencies] # third party @@ -33,5 +34,13 @@ squawk-parser = { version = "0.0.0", path = "./crates/parser" } squawk-linter = { version = "0.0.0", path = "./crates/linter" } squawk-github = { version = "0.0.0", path = "./crates/github" } +[workspace.lints.clippy] +collapsible_else_if = "allow" +collapsible_if = "allow" +needless_return = "allow" + +[profile.dev] +debug = 0 + [profile.dev.package] insta.opt-level = 3 From 771c36c03fb706a7a8e2e97b6680c6463ab22ec9 Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Sat, 3 May 2025 17:00:10 -0400 Subject: [PATCH 5/6] fix --- Cargo.toml | 4 ++++ crates/squawk_lexer/src/cursor.rs | 4 ++-- crates/squawk_lexer/src/lib.rs | 7 +++---- crates/squawk_lexer/src/token.rs | 20 ++++++++++---------- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 31700045..49cd1eb8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,6 +38,10 @@ squawk-github = { version = "0.0.0", path = "./crates/github" } collapsible_else_if = "allow" collapsible_if = "allow" needless_return = "allow" +if_not_else = "allow" +needless_raw_string_hashes = "allow" +cast_possible_truncation = "allow" +semicolon_if_nothing_returned = "allow" [profile.dev] debug = 0 diff --git a/crates/squawk_lexer/src/cursor.rs b/crates/squawk_lexer/src/cursor.rs index dad5d9e5..c70c5feb 100644 --- 
a/crates/squawk_lexer/src/cursor.rs +++ b/crates/squawk_lexer/src/cursor.rs @@ -5,8 +5,8 @@ use std::str::Chars; /// Next characters can be peeked via `first` method, /// and position can be shifted forward via `bump` method. /// based on: -/// - https://github.com/rust-lang/rust/blob/d1b7355d3d7b4ead564dbecb1d240fcc74fff21b/compiler/rustc_lexer/src/cursor.rs -/// - https://github.com/astral-sh/ruff/blob/d1079680bb29f6b797b5df15327195300f635f3c/crates/ruff_python_parser/src/lexer/cursor.rs +/// - +/// - /// pub(crate) struct Cursor<'a> { /// Iterator over chars. Slightly faster than a &str. diff --git a/crates/squawk_lexer/src/lib.rs b/crates/squawk_lexer/src/lib.rs index 9ca95112..dff91c6c 100644 --- a/crates/squawk_lexer/src/lib.rs +++ b/crates/squawk_lexer/src/lib.rs @@ -32,9 +32,8 @@ const fn is_whitespace(c: char) -> bool { impl Cursor<'_> { // see: https://github.com/rust-lang/rust/blob/ba1d7f4a083e6402679105115ded645512a7aea8/compiler/rustc_lexer/src/lib.rs#L339 pub(crate) fn advance_token(&mut self) -> Token { - let first_char = match self.bump() { - Some(c) => c, - None => return Token::new(TokenKind::Eof, 0), + let Some(first_char) = self.bump() else { + return Token::new(TokenKind::Eof, 0); }; let token_kind = match first_char { // Slash, comment or block comment. 
@@ -418,7 +417,7 @@ impl Cursor<'_> { // might be the start of our start/end sequence let mut match_count = 0; - for start_char in start.iter() { + for start_char in &start { if self.first() == *start_char { self.bump(); match_count += 1; diff --git a/crates/squawk_lexer/src/token.rs b/crates/squawk_lexer/src/token.rs index 9853f8cc..5827c672 100644 --- a/crates/squawk_lexer/src/token.rs +++ b/crates/squawk_lexer/src/token.rs @@ -79,13 +79,13 @@ pub enum TokenKind { UnknownPrefix, /// Positional Parameter, e.g., `$1` /// - /// see: https://www.postgresql.org/docs/16/sql-expressions.html#SQL-EXPRESSIONS-PARAMETERS-POSITIONAL + /// see: Param, /// Quoted Identifier, e.g., `"update"` in `update "my_table" set "a" = 5;` /// /// These are case-sensitive, unlike [`TokenKind::Ident`] /// - /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS + /// see: QuotedIdent { terminated: bool }, } @@ -122,34 +122,34 @@ pub enum Base { pub enum LiteralKind { /// Integer Numeric, e.g., `42` /// - /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC + /// see: Int { base: Base, empty_int: bool }, /// Float Numeric, e.g., `1.925e-3` /// - /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC + /// see: Float { base: Base, empty_exponent: bool }, /// String, e.g., `'foo'` /// - /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS + /// see: Str { terminated: bool }, /// Hexidecimal Bit String, e.g., `X'1FF'` /// - /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-BIT-STRINGS + /// see: ByteStr { terminated: bool }, /// Bit String, e.g., `B'1001'` /// - /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-BIT-STRINGS + /// see: BitStr { terminated: bool }, /// Dollar Quoted String, e.g., `$$Dianne's horse$$` /// - /// see: 
https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING + /// see: DollarQuotedString { terminated: bool }, /// Unicode Escape String, e.g., `U&'d\0061t\+000061'` /// - /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE + /// see: UnicodeEscStr { terminated: bool }, /// Escape String, e.g, `E'foo'` /// - /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html + /// see: EscStr { terminated: bool }, } From 1cbe6b3e70fae9287e082947498a068f57478bc6 Mon Sep 17 00:00:00 2001 From: Steve Dignam Date: Sat, 3 May 2025 17:18:52 -0400 Subject: [PATCH 6/6] fix --- Cargo.toml | 6 ++---- s/lint | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 49cd1eb8..5b15e8d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,10 +38,8 @@ squawk-github = { version = "0.0.0", path = "./crates/github" } collapsible_else_if = "allow" collapsible_if = "allow" needless_return = "allow" -if_not_else = "allow" -needless_raw_string_hashes = "allow" -cast_possible_truncation = "allow" -semicolon_if_nothing_returned = "allow" +doc_markdown = "deny" +manual_let_else = "deny" [profile.dev] debug = 0 diff --git a/s/lint b/s/lint index 22eb9d2b..5782ff3d 100755 --- a/s/lint +++ b/s/lint @@ -2,4 +2,4 @@ set -eu cargo fmt -- --check -cargo clippy --all-targets --all-features -- -D clippy::pedantic +cargo clippy --all-targets --all-features