diff --git a/Cargo.lock b/Cargo.lock index b2c00ea6..69e41799 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -749,6 +749,13 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "lexer" +version = "0.0.0" +dependencies = [ + "insta", +] + [[package]] name = "libc" version = "0.2.167" diff --git a/Cargo.toml b/Cargo.toml index bfa13eb7..5b15e8d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,12 @@ [workspace] members = ["crates/*"] +resolver = "2" + +[workspace.package] +edition = "2021" +rust-version = "1.81.0" +authors = ["Squawk Team & Contributors"] +license = "GPL-3.0" [workspace.dependencies] # third party @@ -27,5 +34,15 @@ squawk-parser = { version = "0.0.0", path = "./crates/parser" } squawk-linter = { version = "0.0.0", path = "./crates/linter" } squawk-github = { version = "0.0.0", path = "./crates/github" } +[workspace.lints.clippy] +collapsible_else_if = "allow" +collapsible_if = "allow" +needless_return = "allow" +doc_markdown = "deny" +manual_let_else = "deny" + +[profile.dev] +debug = 0 + [profile.dev.package] insta.opt-level = 3 diff --git a/crates/linter/Cargo.toml b/crates/linter/Cargo.toml index 0d6eb7c6..a0cbde71 100644 --- a/crates/linter/Cargo.toml +++ b/crates/linter/Cargo.toml @@ -1,9 +1,9 @@ [package] name = "squawk-linter" version = "0.0.0" -authors = ["Steve Dignam "] -edition = "2018" -license = "GPL-3.0" +authors.workspace = true +edition.workspace = true +license.workspace = true description = "Postgres SQL linter used in squawk" repository = "https://github.com/sbdchd/squawk" readme = "README.md" diff --git a/crates/squawk_lexer/Cargo.toml b/crates/squawk_lexer/Cargo.toml new file mode 100644 index 00000000..699c9077 --- /dev/null +++ b/crates/squawk_lexer/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "lexer" +version = "0.0.0" +description = "TBD" + +authors.workspace = true +edition.workspace = true +license = "MIT" +rust-version.workspace = true + +[lib] +doctest = false + +[dependencies] + +[dev-dependencies] +insta.workspace = true + +[lints] +workspace = true diff --git a/crates/squawk_lexer/README.md b/crates/squawk_lexer/README.md new file mode 100644 index 00000000..59a462a7 --- /dev/null +++ b/crates/squawk_lexer/README.md @@ -0,0 +1,3 @@ +# lexer + +> Adapted from the Rust lexer. diff --git a/crates/squawk_lexer/src/LICENSE-MIT b/crates/squawk_lexer/src/LICENSE-MIT new file mode 100644 index 00000000..163de68c --- /dev/null +++ b/crates/squawk_lexer/src/LICENSE-MIT @@ -0,0 +1,25 @@ +from: https://github.com/rust-lang/rust/blob/176e5452095444815207be02c16de0b1487a1b53/LICENSE-MIT + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/crates/squawk_lexer/src/cursor.rs b/crates/squawk_lexer/src/cursor.rs new file mode 100644 index 00000000..c70c5feb --- /dev/null +++ b/crates/squawk_lexer/src/cursor.rs @@ -0,0 +1,65 @@ +use std::str::Chars; + +/// Peekable iterator over a char sequence. +/// +/// Next characters can be peeked via `first` method, +/// and position can be shifted forward via `bump` method. +/// based on: +/// - +/// - +/// +pub(crate) struct Cursor<'a> { + /// Iterator over chars. Slightly faster than a &str. + chars: Chars<'a>, + len_remaining: usize, +} + +pub(crate) const EOF_CHAR: char = '\0'; + +impl<'a> Cursor<'a> { + pub(crate) fn new(input: &'a str) -> Cursor<'a> { + Cursor { + len_remaining: input.len(), + chars: input.chars(), + } + } + + /// Peeks the next symbol from the input stream without consuming it. + /// If requested position doesn't exist, `EOF_CHAR` is returned. + /// However, getting `EOF_CHAR` doesn't always mean actual end of file, + /// it should be checked with `is_eof` method. + pub(crate) fn first(&self) -> char { + // `.next()` optimizes better than `.nth(0)` + self.chars.clone().next().unwrap_or(EOF_CHAR) + } + + /// Checks if there is nothing more to consume. + pub(crate) fn is_eof(&self) -> bool { + self.chars.as_str().is_empty() + } + + /// Returns the number of already consumed symbols. + pub(crate) fn pos_within_token(&self) -> u32 { + (self.len_remaining - self.chars.as_str().len()) as u32 + } + + /// Resets the number of bytes consumed to 0. + pub(crate) fn reset_pos_within_token(&mut self) { + self.len_remaining = self.chars.as_str().len(); + } + + /// Moves to the next character. + pub(crate) fn bump(&mut self) -> Option<char> { + let c = self.chars.next()?; + Some(c) + } + + /// Eats symbols while predicate returns true or until the end of file is reached. + pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + // An optimized version of this was tried for e.g. line comments, but + // LLVM can inline all of this and compile it down to fast iteration over bytes.
+ while predicate(self.first()) && !self.is_eof() { + self.bump(); + } + } +} diff --git a/crates/squawk_lexer/src/lib.rs b/crates/squawk_lexer/src/lib.rs new file mode 100644 index 00000000..dff91c6c --- /dev/null +++ b/crates/squawk_lexer/src/lib.rs @@ -0,0 +1,734 @@ +mod cursor; +mod token; +use cursor::{Cursor, EOF_CHAR}; +pub use token::{Base, LiteralKind, Token, TokenKind}; + +// via: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L346 +// ident_start [A-Za-z\200-\377_] +const fn is_ident_start(c: char) -> bool { + matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '\u{80}'..='\u{FF}') +} + +// ident_cont [A-Za-z\200-\377_0-9\$] +const fn is_ident_cont(c: char) -> bool { + matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '$' | '\u{80}'..='\u{FF}') +} + +// see: +// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scansup.c#L107-L128 +// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L204-L229 +const fn is_whitespace(c: char) -> bool { + matches!( + c, + ' ' // space + | '\t' // tab + | '\n' // newline + | '\r' // carriage return + | '\u{000B}' // vertical tab + | '\u{000C}' // form feed + ) +} + +impl Cursor<'_> { + // see: https://github.com/rust-lang/rust/blob/ba1d7f4a083e6402679105115ded645512a7aea8/compiler/rustc_lexer/src/lib.rs#L339 + pub(crate) fn advance_token(&mut self) -> Token { + let Some(first_char) = self.bump() else { + return Token::new(TokenKind::Eof, 0); + }; + let token_kind = match first_char { + // Slash, comment or block comment. + '/' => match self.first() { + '*' => self.block_comment(), + _ => TokenKind::Slash, + }, + '-' => match self.first() { + '-' => self.line_comment(), + _ => TokenKind::Minus, + }, + + // // Whitespace sequence. + c if is_whitespace(c) => self.whitespace(), + + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE + 'u' | 'U' => match self.first() { + '&' => { + self.bump(); + self.prefixed_string( + |terminated| LiteralKind::UnicodeEscStr { terminated }, + true, + ) + } + _ => self.ident_or_unknown_prefix(), + }, + + // escaped strings + 'e' | 'E' => { + self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false) + } + + // bit string + 'b' | 'B' => { + self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false) + } + + // hexadecimal byte string + 'x' | 'X' => { + self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false) + } + + // Identifier (this should be checked after other variant that can + // start as identifier). + c if is_ident_start(c) => self.ident(), + + // Numeric literal. + // see: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC + c @ '0'..='9' => { + let literal_kind = self.number(c); + TokenKind::Literal { kind: literal_kind } + } + '.' => match self.first() { + '0'..='9' => { + let literal_kind = self.number('.'); + TokenKind::Literal { kind: literal_kind } + } + _ => TokenKind::Dot, + }, + // One-symbol tokens. + ';' => TokenKind::Semi, + ',' => TokenKind::Comma, + '(' => TokenKind::OpenParen, + ')' => TokenKind::CloseParen, + '[' => TokenKind::OpenBracket, + ']' => TokenKind::CloseBracket, + '@' => TokenKind::At, + '#' => TokenKind::Pound, + '~' => TokenKind::Tilde, + '?' 
=> TokenKind::Question, + ':' => TokenKind::Colon, + '$' => { + // Dollar quoted strings + if is_ident_start(self.first()) || self.first() == '$' { + self.dollar_quoted_string() + } else { + // Parameters + while self.first().is_ascii_digit() { + self.bump(); + } + TokenKind::Param + } + } + '`' => TokenKind::Backtick, + '=' => TokenKind::Eq, + '!' => TokenKind::Bang, + '<' => TokenKind::Lt, + '>' => TokenKind::Gt, + '&' => TokenKind::And, + '|' => TokenKind::Or, + '+' => TokenKind::Plus, + '*' => TokenKind::Star, + '^' => TokenKind::Caret, + '%' => TokenKind::Percent, + + // String literal + '\'' => { + let terminated = self.single_quoted_string(); + let kind = LiteralKind::Str { terminated }; + TokenKind::Literal { kind } + } + + // Quoted identifiers + '"' => { + let terminated = self.double_quoted_string(); + TokenKind::QuotedIdent { terminated } + } + _ => TokenKind::Unknown, + }; + let res = Token::new(token_kind, self.pos_within_token()); + self.reset_pos_within_token(); + res + } + pub(crate) fn ident(&mut self) -> TokenKind { + self.eat_while(is_ident_cont); + TokenKind::Ident + } + + pub(crate) fn whitespace(&mut self) -> TokenKind { + self.eat_while(is_whitespace); + TokenKind::Whitespace + } + + fn ident_or_unknown_prefix(&mut self) -> TokenKind { + // Start is already eaten, eat the rest of identifier. + self.eat_while(is_ident_cont); + // Known prefixes must have been handled earlier. So if + // we see a prefix here, it is definitely an unknown prefix. + match self.first() { + '#' | '"' | '\'' => TokenKind::UnknownPrefix, + _ => TokenKind::Ident, + } + } + + // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L227 + // comment ("--"{non_newline}*) + pub(crate) fn line_comment(&mut self) -> TokenKind { + self.bump(); + + self.eat_while(|c| c != '\n'); + TokenKind::LineComment + } + + // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L324-L344 + pub(crate) fn block_comment(&mut self) -> TokenKind { + self.bump(); + + let mut depth = 1usize; + while let Some(c) = self.bump() { + match c { + '/' if self.first() == '*' => { + self.bump(); + depth += 1; + } + '*' if self.first() == '/' => { + self.bump(); + depth -= 1; + if depth == 0 { + // This block comment is closed, so for a construction like "/* */ */" + // there will be a successfully parsed block comment "/* */" + // and " */" will be processed separately. + break; + } + } + _ => (), + } + } + + TokenKind::BlockComment { + terminated: depth == 0, + } + } + + fn prefixed_string( + &mut self, + mk_kind: fn(bool) -> LiteralKind, + allows_double: bool, + ) -> TokenKind { + match self.first() { + '\'' => { + self.bump(); + let terminated = self.single_quoted_string(); + let kind = mk_kind(terminated); + TokenKind::Literal { kind } + } + '"' if allows_double => { + self.bump(); + let terminated = self.double_quoted_string(); + let kind = mk_kind(terminated); + TokenKind::Literal { kind } + } + _ => self.ident_or_unknown_prefix(), + } + } + + fn number(&mut self, first_digit: char) -> LiteralKind { + let mut base = Base::Decimal; + if first_digit == '0' { + // Attempt to parse encoding base.
+ match self.first() { + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L403 + 'b' | 'B' => { + base = Base::Binary; + self.bump(); + if !self.eat_decimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L402 + 'o' | 'O' => { + base = Base::Octal; + self.bump(); + if !self.eat_decimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L401 + 'x' | 'X' => { + base = Base::Hexadecimal; + self.bump(); + if !self.eat_hexadecimal_digits() { + return LiteralKind::Int { + base, + empty_int: true, + }; + } + } + // Not a base prefix; consume additional digits. + '0'..='9' | '_' => { + self.eat_decimal_digits(); + } + + // Also not a base prefix; nothing more to do here. + '.' | 'e' | 'E' => {} + + // Just a 0. + _ => { + return LiteralKind::Int { + base, + empty_int: false, + } + } + } + } else { + // No base prefix, parse number in the usual way. + self.eat_decimal_digits(); + }; + + match self.first() { + '.' => { + // might have stuff after the ., and if it does, it needs to start + // with a number + self.bump(); + let mut empty_exponent = false; + if self.first().is_ascii_digit() { + self.eat_decimal_digits(); + match self.first() { + 'e' | 'E' => { + self.bump(); + empty_exponent = !self.eat_float_exponent(); + } + _ => (), + } + } + LiteralKind::Float { + base, + empty_exponent, + } + } + 'e' | 'E' => { + self.bump(); + let empty_exponent = !self.eat_float_exponent(); + LiteralKind::Float { + base, + empty_exponent, + } + } + _ => LiteralKind::Int { + base, + empty_int: false, + }, + } + } + + fn single_quoted_string(&mut self) -> bool { + // Parse until either quotes are terminated or error is detected. + loop { + match self.first() { + // Quotes might be terminated. + '\'' => { + self.bump(); + + match self.first() { + // encountered an escaped quote '' + '\'' => { + self.bump(); + } + // encountered terminating quote + _ => return true, + } + } + // End of file, stop parsing. + EOF_CHAR if self.is_eof() => break, + // Skip the character. + _ => { + self.bump(); + } + } + } + // String was not terminated. + false + } + + /// Eats double-quoted string and returns true + /// if string is terminated. + fn double_quoted_string(&mut self) -> bool { + while let Some(c) = self.bump() { + match c { + '"' => { + return true; + } + '\\' if self.first() == '\\' || self.first() == '"' => { + // Bump again to skip escaped character. + self.bump(); + } + _ => (), + } + } + // End of file reached. 
+ false + } + + // https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING + fn dollar_quoted_string(&mut self) -> TokenKind { + // Get the start sequence of the dollar quote, i.e., 'foo' in + // $foo$hello$foo$ + let mut start = vec![]; + while let Some(c) = self.bump() { + match c { + '$' => { + self.bump(); + break; + } + _ => { + start.push(c); + } + } + } + + if start.is_empty() { + loop { + self.eat_while(|c| c != '$'); + if self.is_eof() { + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated: false }, + }; + } + // eat $ + self.bump(); + if self.first() == '$' { + self.bump(); + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated: true }, + }; + } + } + } else { + loop { + self.eat_while(|c| c != start[0]); + if self.is_eof() { + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated: false }, + }; + } + + // might be the start of our start/end sequence + let mut match_count = 0; + for start_char in &start { + if self.first() == *start_char { + self.bump(); + match_count += 1; + } else { + self.bump(); + break; + } + } + + // closing '$' + if self.first() == '$' { + self.bump(); + let terminated = match_count == start.len(); + return TokenKind::Literal { + kind: LiteralKind::DollarQuotedString { terminated }, + }; + } + } + } + } + + fn eat_decimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.first() { + '_' => { + self.bump(); + } + '0'..='9' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits + } + + fn eat_hexadecimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.first() { + '_' => { + self.bump(); + } + '0'..='9' | 'a'..='f' | 'A'..='F' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits + } + + /// Eats the float exponent. Returns true if at least one digit was met, + /// and returns false otherwise. + fn eat_float_exponent(&mut self) -> bool { + if self.first() == '-' || self.first() == '+' { + self.bump(); + } + self.eat_decimal_digits() + } +} + +/// Creates an iterator that produces tokens from the input string. 
+pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ { + let mut cursor = Cursor::new(input); + std::iter::from_fn(move || { + let token = cursor.advance_token(); + if token.kind != TokenKind::Eof { + Some(token) + } else { + None + } + }) +} + +#[cfg(test)] +mod tests { + use std::fmt; + + use super::*; + use insta::assert_debug_snapshot; + + struct TokenDebug<'a> { + content: &'a str, + token: Token, + } + impl fmt::Debug for TokenDebug<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:?} @ {:?}", self.content, self.token.kind) + } + } + + impl<'a> TokenDebug<'a> { + fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> { + TokenDebug { + token, + content: &input[start as usize..(start + token.len) as usize], + } + } + } + + fn lex(input: &str) -> Vec<TokenDebug> { + let mut tokens = vec![]; + let mut start = 0; + + for token in tokenize(input) { + let length = token.len; + tokens.push(TokenDebug::new(token, input, start)); + start += length; + } + tokens + } + #[test] + fn lex_statement() { + let result = lex("select 1;"); + assert_debug_snapshot!(result); + } + + #[test] + fn block_comment() { + let result = lex(r#" +/* + * foo + * bar +*/"#); + assert_debug_snapshot!(result); + } + + #[test] + fn block_comment_unterminated() { + let result = lex(r#" +/* + * foo + * bar + /* +*/"#); + assert_debug_snapshot!(result); + } + + #[test] + fn line_comment() { + let result = lex(r#" +-- foooooooooooo bar buzz +"#); + assert_debug_snapshot!(result); + } + + #[test] + fn line_comment_whitespace() { + assert_debug_snapshot!(lex(r#" +select 'Hello' -- This is a comment +' World';"#)) + } + + #[test] + fn dollar_quoting() { + assert_debug_snapshot!(lex(r#" +$$Dianne's horse$$ +$SomeTag$Dianne's horse$SomeTag$ + +-- with dollar inside and matching tags +$foo$hello$world$bar$ +"#)) + } + + #[test] + fn dollar_quote_mismatch_tags_simple() { + assert_debug_snapshot!(lex(r#" +-- dollar quoting with mismatched tags +$foo$hello world$bar$ +"#)); + } + + #[test] + fn dollar_quote_mismatch_tags_complex() { + assert_debug_snapshot!(lex(r#" +-- with dollar inside but mismatched tags +$foo$hello$world$bar$ +"#)); + } + + #[test] + fn numeric() { + assert_debug_snapshot!(lex(r#" +42 +3.5 +4.
+.001 +.123e10 +5e2 +1.925e-3 +1e-10 +1e+10 +1e10 +"#)) + } + + #[test] + fn numeric_non_decimal() { + assert_debug_snapshot!(lex(r#" +0b100101 +0B10011001 +0o273 +0O755 +0x42f +0XFFFF +"#)) + } + + #[test] + fn numeric_with_seperators() { + assert_debug_snapshot!(lex(r#" +1_500_000_000 +0b10001000_00000000 +0o_1_755 +0xFFFF_FFFF +1.618_034 +"#)) + } + + #[test] + fn select_with_period() { + assert_debug_snapshot!(lex(r#" +select public.users; +"#)) + } + + #[test] + fn bitstring() { + assert_debug_snapshot!(lex(r#" +B'1001' +b'1001' +X'1FF' +x'1FF' +"#)) + } + + #[test] + fn string() { + assert_debug_snapshot!(lex(r#" +'Dianne''s horse' + +select 'foo '' +bar'; + +select 'foooo' + 'bar'; + + +'foo \\ \n \tbar' + +'forgot to close the string +"#)) + } + + #[test] + fn params() { + assert_debug_snapshot!(lex(r#" +select $1 + $2; + +select $1123123123123; + +select $; +"#)) + } + + #[test] + fn string_with_escapes() { + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-ESCAPE + + assert_debug_snapshot!(lex(r#" +E'foo' + +e'bar' + +e'\b\f\n\r\t' + +e'\0\11\777' + +e'\x0\x11\xFF' + +e'\uAAAA \UFFFFFFFF' + +"#)) + } + + #[test] + fn string_unicode_escape() { + // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE + + assert_debug_snapshot!(lex(r#" +U&"d\0061t\+000061" + +U&"\0441\043B\043E\043D" + +u&'\0441\043B' + +U&"d!0061t!+000061" UESCAPE '!' +"#)) + } + + #[test] + fn quoted_ident() { + assert_debug_snapshot!(lex(r#" +"hello &1 -world"; + + +"hello-world +"#)) + } +} diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__bitstring.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__bitstring.snap new file mode 100644 index 00000000..9399cff9 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__bitstring.snap @@ -0,0 +1,15 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\nB'1001'\nb'1001'\nX'1FF'\nx'1FF'\n\"#)" +--- +[ + "\n" @ Whitespace, + "B'1001'" @ Literal { kind: BitStr { terminated: true } }, + "\n" @ Whitespace, + "b'1001'" @ Literal { kind: BitStr { terminated: true } }, + "\n" @ Whitespace, + "X'1FF'" @ Literal { kind: ByteStr { terminated: true } }, + "\n" @ Whitespace, + "x'1FF'" @ Literal { kind: ByteStr { terminated: true } }, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment.snap new file mode 100644 index 00000000..f7bfe460 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment.snap @@ -0,0 +1,8 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: result +--- +[ + "\n" @ Whitespace, + "/*\n * foo\n * bar\n*/" @ BlockComment { terminated: true }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment_unterminated.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment_unterminated.snap new file mode 100644 index 00000000..2acb3e33 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment_unterminated.snap @@ -0,0 +1,8 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: result +--- +[ + "\n" @ Whitespace, + "/*\n * foo\n * bar\n /*\n*/" @ BlockComment { terminated: false }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_complex.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_complex.snap new file mode 100644 index 00000000..914f4770 --- /dev/null +++ 
b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_complex.snap @@ -0,0 +1,10 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n-- with dollar inside but mismatched tags\n$foo$hello$world$bar$\n\"#)" +--- +[ + "\n" @ Whitespace, + "-- with dollar inside but mismatched tags" @ LineComment, + "\n" @ Whitespace, + "$foo$hello$world$bar$\n" @ Literal { kind: DollarQuotedString { terminated: false } }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_simple.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_simple.snap new file mode 100644 index 00000000..1035dd00 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_simple.snap @@ -0,0 +1,10 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n-- dollar quoting with mismatched tags\n$foo$hello world$bar$\n\"#)" +--- +[ + "\n" @ Whitespace, + "-- dollar quoting with mismatched tags" @ LineComment, + "\n" @ Whitespace, + "$foo$hello world$bar$\n" @ Literal { kind: DollarQuotedString { terminated: false } }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quoting.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quoting.snap new file mode 100644 index 00000000..b1d86251 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quoting.snap @@ -0,0 +1,14 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n$$Dianne's horse$$\n$SomeTag$Dianne's horse$SomeTag$\n\n-- with dollar inside and matching tags\n$foo$hello$world$bar$\n\"#)" +--- +[ + "\n" @ Whitespace, + "$$Dianne's horse$$" @ Literal { kind: DollarQuotedString { terminated: true } }, + "\n" @ Whitespace, + "$SomeTag$Dianne's horse$SomeTag$" @ Literal { kind: DollarQuotedString { terminated: true } }, + "\n\n" @ Whitespace, + "-- with dollar inside and matching tags" @ LineComment, + "\n" @ Whitespace, + "$foo$hello$world$bar$\n" @ Literal { kind: DollarQuotedString { terminated: false } }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__lex_statement.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__lex_statement.snap new file mode 100644 index 00000000..a76c42ba --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__lex_statement.snap @@ -0,0 +1,10 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: result +--- +[ + "select" @ Ident, + " " @ Whitespace, + "1" @ Literal { kind: Int { base: Decimal, empty_int: false } }, + ";" @ Semi, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment.snap new file mode 100644 index 00000000..e58ef5ff --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment.snap @@ -0,0 +1,9 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: result +--- +[ + "\n" @ Whitespace, + "-- foooooooooooo bar buzz" @ LineComment, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment_whitespace.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment_whitespace.snap new file mode 100644 index 00000000..fef03086 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment_whitespace.snap @@ -0,0 +1,15 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\nselect 'Hello' -- This is a comment\n' World';\"#)" +--- +[ + "\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + 
"'Hello'" @ Literal { kind: Str { terminated: true } }, + " " @ Whitespace, + "-- This is a comment" @ LineComment, + "\n" @ Whitespace, + "' World'" @ Literal { kind: Str { terminated: true } }, + ";" @ Semi, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__numeric.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric.snap new file mode 100644 index 00000000..c831bd25 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric.snap @@ -0,0 +1,27 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n42\n3.5\n4.\n.001\n.123e10\n5e2\n1.925e-3\n1e-10\n1e+10\n1e10\n\"#)" +--- +[ + "\n" @ Whitespace, + "42" @ Literal { kind: Int { base: Decimal, empty_int: false } }, + "\n" @ Whitespace, + "3.5" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + "4." @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + ".001" @ Literal { kind: Int { base: Decimal, empty_int: false } }, + "\n" @ Whitespace, + ".123e10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + "5e2" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + "1.925e-3" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + "1e-10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + "1e+10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, + "1e10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_non_decimal.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_non_decimal.snap new file mode 100644 index 00000000..5050265f --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_non_decimal.snap @@ -0,0 +1,19 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n0b100101\n0B10011001\n0o273\n0O755\n0x42f\n0XFFFF\n\"#)" +--- +[ + "\n" @ Whitespace, + "0b100101" @ Literal { kind: Int { base: Binary, empty_int: false } }, + "\n" @ Whitespace, + "0B10011001" @ Literal { kind: Int { base: Binary, empty_int: false } }, + "\n" @ Whitespace, + "0o273" @ Literal { kind: Int { base: Octal, empty_int: false } }, + "\n" @ Whitespace, + "0O755" @ Literal { kind: Int { base: Octal, empty_int: false } }, + "\n" @ Whitespace, + "0x42f" @ Literal { kind: Int { base: Hexadecimal, empty_int: false } }, + "\n" @ Whitespace, + "0XFFFF" @ Literal { kind: Int { base: Hexadecimal, empty_int: false } }, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_with_seperators.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_with_seperators.snap new file mode 100644 index 00000000..46814c5f --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_with_seperators.snap @@ -0,0 +1,17 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n1_500_000_000\n0b10001000_00000000\n0o_1_755\n0xFFFF_FFFF\n1.618_034\n\"#)" +--- +[ + "\n" @ Whitespace, + "1_500_000_000" @ Literal { kind: Int { base: Decimal, empty_int: false } }, + "\n" @ Whitespace, + "0b10001000_00000000" @ Literal { kind: Int { base: Binary, empty_int: false } }, + "\n" @ Whitespace, + "0o_1_755" @ Literal { kind: Int { base: Octal, empty_int: false } }, + "\n" @ Whitespace, + "0xFFFF_FFFF" @ Literal { kind: Int { base: Hexadecimal, empty_int: false } }, + 
"\n" @ Whitespace, + "1.618_034" @ Literal { kind: Float { base: Decimal, empty_exponent: false } }, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__params.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__params.snap new file mode 100644 index 00000000..1879b452 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__params.snap @@ -0,0 +1,26 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\nselect $1 + $2;\n\nselect $1123123123123;\n\nselect $;\n\"#)" +--- +[ + "\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + "$1" @ Param, + " " @ Whitespace, + "+" @ Plus, + " " @ Whitespace, + "$2" @ Param, + ";" @ Semi, + "\n\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + "$1123123123123" @ Param, + ";" @ Semi, + "\n\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + "$" @ Param, + ";" @ Semi, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__quoted_ident.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__quoted_ident.snap new file mode 100644 index 00000000..70f71342 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__quoted_ident.snap @@ -0,0 +1,11 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n\"hello &1 -world\";\n\n\n\"hello-world\n\"#)" +--- +[ + "\n" @ Whitespace, + "\"hello &1 -world\"" @ QuotedIdent { terminated: true }, + ";" @ Semi, + "\n\n\n" @ Whitespace, + "\"hello-world\n" @ QuotedIdent { terminated: false }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__select_with_period.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__select_with_period.snap new file mode 100644 index 00000000..00ef920f --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__select_with_period.snap @@ -0,0 +1,14 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\nselect public.users;\n\"#)" +--- +[ + "\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + "public" @ Ident, + "." 
@ Dot, + "users" @ Ident, + ";" @ Semi, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__string.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__string.snap new file mode 100644 index 00000000..1022c823 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__string.snap @@ -0,0 +1,24 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\n'Dianne''s horse'\n\nselect 'foo ''\nbar';\n\nselect 'foooo' \n 'bar';\n\n\n'foo \\\\ \\n \\tbar'\n\n'forgot to close the string\n\"#)" +--- +[ + "\n" @ Whitespace, + "'Dianne''s horse'" @ Literal { kind: Str { terminated: true } }, + "\n\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + "'foo ''\nbar'" @ Literal { kind: Str { terminated: true } }, + ";" @ Semi, + "\n\n" @ Whitespace, + "select" @ Ident, + " " @ Whitespace, + "'foooo'" @ Literal { kind: Str { terminated: true } }, + " \n " @ Whitespace, + "'bar'" @ Literal { kind: Str { terminated: true } }, + ";" @ Semi, + "\n\n\n" @ Whitespace, + "'foo \\\\ \\n \\tbar'" @ Literal { kind: Str { terminated: true } }, + "\n\n" @ Whitespace, + "'forgot to close the string\n" @ Literal { kind: Str { terminated: false } }, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__string_unicode_escape.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__string_unicode_escape.snap new file mode 100644 index 00000000..b257b050 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__string_unicode_escape.snap @@ -0,0 +1,19 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\nU&\"d\\0061t\\+000061\"\n\nU&\"\\0441\\043B\\043E\\043D\"\n\nu&'\\0441\\043B'\n\nU&\"d!0061t!+000061\" UESCAPE '!'\n\"#)" +--- +[ + "\n" @ Whitespace, + "U&\"d\\0061t\\+000061\"" @ Literal { kind: UnicodeEscStr { terminated: true } }, + "\n\n" @ Whitespace, + "U&\"\\0441\\043B\\043E\\043D\"" @ Literal { kind: UnicodeEscStr { terminated: true } }, + "\n\n" @ Whitespace, + "u&'\\0441\\043B'" @ Literal { kind: UnicodeEscStr { terminated: true } }, + "\n\n" @ Whitespace, + "U&\"d!0061t!+000061\"" @ Literal { kind: UnicodeEscStr { terminated: true } }, + " " @ Whitespace, + "UESCAPE" @ Ident, + " " @ Whitespace, + "'!'" @ Literal { kind: Str { terminated: true } }, + "\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__string_with_escapes.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__string_with_escapes.snap new file mode 100644 index 00000000..67947733 --- /dev/null +++ b/crates/squawk_lexer/src/snapshots/lexer__tests__string_with_escapes.snap @@ -0,0 +1,19 @@ +--- +source: crates/squawk_lexer/src/lib.rs +expression: "lex(r#\"\nE'foo'\n\ne'bar'\n\ne'\\b\\f\\n\\r\\t'\n\ne'\\0\\11\\777'\n\ne'\\x0\\x11\\xFF'\n\ne'\\uAAAA \\UFFFFFFFF'\n\n\"#)" +--- +[ + "\n" @ Whitespace, + "E'foo'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ Whitespace, + "e'bar'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ Whitespace, + "e'\\b\\f\\n\\r\\t'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ Whitespace, + "e'\\0\\11\\777'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ Whitespace, + "e'\\x0\\x11\\xFF'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ Whitespace, + "e'\\uAAAA \\UFFFFFFFF'" @ Literal { kind: EscStr { terminated: true } }, + "\n\n" @ Whitespace, +] diff --git a/crates/squawk_lexer/src/token.rs b/crates/squawk_lexer/src/token.rs new file mode 100644 index 00000000..5827c672 --- /dev/null +++ b/crates/squawk_lexer/src/token.rs @@ -0,0 
+1,155 @@ +// based on: https://github.com/rust-lang/rust/blob/d1b7355d3d7b4ead564dbecb1d240fcc74fff21b/compiler/rustc_lexer/src/lib.rs#L58 +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum TokenKind { + /// Used when there's an error of some sort while lexing. + Unknown, + /// A literal value, e.g., `42`, `1.925e-3`, `'foo'`, `B'1001'`. + /// Unterminated string-style literals are flagged via their kind so + /// callers can report them later. + /// + /// See [`LiteralKind`] for more details. + Literal { kind: LiteralKind }, + /// Space, tab, newline, carriage return, vertical tab, form feed + Whitespace, + /// Identifier + /// + /// case-sensitive + Ident, + /// `;` + Semi, + /// End of file + Eof, + /// `/` + Slash, + /// `-- foo` + LineComment, + /// ``` + /// /* + /// foo + /// */ + /// ``` + BlockComment { terminated: bool }, + /// `-` + Minus, + /// `:` + Colon, + /// `.` + Dot, + /// `=` + Eq, + /// `>` + Gt, + /// `&` + And, + /// `<` + Lt, + /// `!` + Bang, + /// `+` + Plus, + /// `~` + Tilde, + /// `#` + Pound, + /// `?` + Question, + /// `|` + Or, + /// `%` + Percent, + /// `^` + Caret, + /// `*` + Star, + /// `` ` `` + Backtick, + /// `@` + At, + /// `]` + CloseBracket, + /// `[` + OpenBracket, + /// `)` + CloseParen, + /// `(` + OpenParen, + /// `,` + Comma, + /// Error case that we need to report later on. + UnknownPrefix, + /// Positional Parameter, e.g., `$1` + /// + /// see: + Param, + /// Quoted Identifier, e.g., `"update"` in `update "my_table" set "a" = 5;` + /// + /// These are case-sensitive, unlike [`TokenKind::Ident`] + /// + /// see: + QuotedIdent { terminated: bool }, +} + +/// Parsed token. +/// It doesn't contain information about data that has been parsed, +/// only the type of the token and its size. +#[derive(Debug, Clone, Copy)] +pub struct Token { + pub kind: TokenKind, + pub len: u32, +} + +impl Token { + pub(crate) fn new(kind: TokenKind, len: u32) -> Token { + Token { kind, len } + } +} + +/// Base of numeric literal encoding according to its prefix. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum Base { + /// Literal starts with "0b". + Binary = 2, + /// Literal starts with "0o". + Octal = 8, + /// Literal doesn't contain a prefix. + Decimal = 10, + /// Literal starts with "0x". + Hexadecimal = 16, +} + +/// Enum representing the literal types supported by the lexer. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum LiteralKind { + /// Integer Numeric, e.g., `42` + /// + /// see: + Int { base: Base, empty_int: bool }, + /// Float Numeric, e.g., `1.925e-3` + /// + /// see: + Float { base: Base, empty_exponent: bool }, + /// String, e.g., `'foo'` + /// + /// see: + Str { terminated: bool }, + /// Hexadecimal Bit String, e.g., `X'1FF'` + /// + /// see: + ByteStr { terminated: bool }, + /// Bit String, e.g., `B'1001'` + /// + /// see: + BitStr { terminated: bool }, + /// Dollar Quoted String, e.g., `$$Dianne's horse$$` + /// + /// see: + DollarQuotedString { terminated: bool }, + /// Unicode Escape String, e.g., `U&'d\0061t\+000061'` + /// + /// see: + UnicodeEscStr { terminated: bool }, + /// Escape String, e.g., `E'foo'` + /// + /// see: + EscStr { terminated: bool }, +} diff --git a/s/lint b/s/lint index 22eb9d2b..5782ff3d 100755 --- a/s/lint +++ b/s/lint @@ -2,4 +2,4 @@ set -eu cargo fmt -- --check -cargo clippy --all-targets --all-features -- -D clippy::pedantic +cargo clippy --all-targets --all-features
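
Usage note (not part of the diff): `tokenize` yields `Token`s that carry only a kind and a byte length, so callers reconstruct source offsets themselves, the same way the `lex` helper in the tests does. A minimal sketch of how a dependent crate might consume the new API, assuming it depends on this package under the `lexer` name used in its Cargo.toml:

```rust
// Sketch only: walk the tokens from `tokenize`, recovering each token's
// source text from its length, mirroring the `lex` test helper above.
use lexer::{tokenize, TokenKind};

fn main() {
    let sql = "select 1;";
    let mut start = 0usize;
    for token in tokenize(sql) {
        let end = start + token.len as usize;
        // `Token` stores no text, only a kind and a length, so slice the input.
        let text = &sql[start..end];
        if token.kind != TokenKind::Whitespace {
            println!("{text:?} @ {:?}", token.kind);
        }
        start = end;
    }
}
```

Keeping tokens as (kind, length) pairs matches the rustc_lexer design the crate is adapted from; a later parsing layer can build spans or look up lexemes without the lexer allocating any strings.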