From 6d56e4dc0d5310339ebdff44ed5885565cce72eb Mon Sep 17 00:00:00 2001
From: Steve Dignam <steve@dignam.xyz>
Date: Sat, 3 May 2025 16:40:37 -0400
Subject: [PATCH 1/6] v2: add lexer

---
 crates/squawk_lexer/Cargo.toml                |  20 +
 crates/squawk_lexer/README.md                 |   3 +
 crates/squawk_lexer/src/LICENSE-MIT           |  25 +
 crates/squawk_lexer/src/cursor.rs             |  65 ++
 crates/squawk_lexer/src/lib.rs                | 735 ++++++++++++++++++
 .../snapshots/lexer__tests__bitstring.snap    |  15 +
 .../lexer__tests__block_comment.snap          |   8 +
 ...er__tests__block_comment_unterminated.snap |   8 +
 ...s__dollar_quote_mismatch_tags_complex.snap |  10 +
 ...ts__dollar_quote_mismatch_tags_simple.snap |  10 +
 .../lexer__tests__dollar_quoting.snap         |  14 +
 .../lexer__tests__lex_statement.snap          |  10 +
 .../snapshots/lexer__tests__line_comment.snap |   9 +
 ...lexer__tests__line_comment_whitespace.snap |  15 +
 .../src/snapshots/lexer__tests__numeric.snap  |  27 +
 .../lexer__tests__numeric_non_decimal.snap    |  19 +
 ...lexer__tests__numeric_with_seperators.snap |  17 +
 .../src/snapshots/lexer__tests__params.snap   |  26 +
 .../snapshots/lexer__tests__quoted_ident.snap |  11 +
 .../lexer__tests__select_with_period.snap     |  14 +
 .../src/snapshots/lexer__tests__string.snap   |  24 +
 .../lexer__tests__string_unicode_escape.snap  |  19 +
 .../lexer__tests__string_with_escapes.snap    |  19 +
 crates/squawk_lexer/src/token.rs              | 155 ++++
 24 files changed, 1278 insertions(+)
 create mode 100644 crates/squawk_lexer/Cargo.toml
 create mode 100644 crates/squawk_lexer/README.md
 create mode 100644 crates/squawk_lexer/src/LICENSE-MIT
 create mode 100644 crates/squawk_lexer/src/cursor.rs
 create mode 100644 crates/squawk_lexer/src/lib.rs
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__bitstring.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__block_comment.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__block_comment_unterminated.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_complex.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_simple.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quoting.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__lex_statement.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__line_comment.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__line_comment_whitespace.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__numeric.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__numeric_non_decimal.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__numeric_with_seperators.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__params.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__quoted_ident.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__select_with_period.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__string.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__string_unicode_escape.snap
 create mode 100644 crates/squawk_lexer/src/snapshots/lexer__tests__string_with_escapes.snap
 create mode 100644 crates/squawk_lexer/src/token.rs

diff --git a/crates/squawk_lexer/Cargo.toml b/crates/squawk_lexer/Cargo.toml
new file mode 100644
index 00000000..11c75156
--- /dev/null
+++ b/crates/squawk_lexer/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "lexer"
+version = "0.0.0"
+description = "TBD"
+
+authors.workspace = true
+edition.workspace = true
+license.workspace = true
+rust-version.workspace = true
+
+[lib]
+doctest = false
+
+[dependencies]
+
+[dev-dependencies]
+insta.workspace = true
+
+[lints]
+workspace = true
diff --git a/crates/squawk_lexer/README.md b/crates/squawk_lexer/README.md
new file mode 100644
index 00000000..59a462a7
--- /dev/null
+++ b/crates/squawk_lexer/README.md
@@ -0,0 +1,3 @@
+# lexer
+
+> Adapted from the Rust lexer.
diff --git a/crates/squawk_lexer/src/LICENSE-MIT b/crates/squawk_lexer/src/LICENSE-MIT
new file mode 100644
index 00000000..163de68c
--- /dev/null
+++ b/crates/squawk_lexer/src/LICENSE-MIT
@@ -0,0 +1,25 @@
+from: https://github.com/rust-lang/rust/blob/176e5452095444815207be02c16de0b1487a1b53/LICENSE-MIT
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/crates/squawk_lexer/src/cursor.rs b/crates/squawk_lexer/src/cursor.rs
new file mode 100644
index 00000000..dad5d9e5
--- /dev/null
+++ b/crates/squawk_lexer/src/cursor.rs
@@ -0,0 +1,65 @@
+use std::str::Chars;
+
+/// Peekable iterator over a char sequence.
+///
+/// The next character can be peeked via the `first` method,
+/// and the position can be shifted forward via the `bump` method.
+/// Based on:
+/// - https://github.com/rust-lang/rust/blob/d1b7355d3d7b4ead564dbecb1d240fcc74fff21b/compiler/rustc_lexer/src/cursor.rs
+/// - https://github.com/astral-sh/ruff/blob/d1079680bb29f6b797b5df15327195300f635f3c/crates/ruff_python_parser/src/lexer/cursor.rs
+pub(crate) struct Cursor<'a> {
+    /// Iterator over chars. Slightly faster than a &str.
+    chars: Chars<'a>,
+    /// Bytes of input left at construction or at the last `reset_pos_within_token`.
+    len_remaining: usize,
+}
+
+pub(crate) const EOF_CHAR: char = '\0';
+
+impl<'a> Cursor<'a> {
+    pub(crate) fn new(input: &'a str) -> Cursor<'a> {
+        Cursor {
+            len_remaining: input.len(),
+            chars: input.chars(),
+        }
+    }
+
+    /// Peeks the next symbol from the input stream without consuming it.
+    /// If requested position doesn't exist, `EOF_CHAR` is returned.
+    /// However, getting `EOF_CHAR` doesn't always mean actual end of file,
+    /// it should be checked with `is_eof` method.
+    pub(crate) fn first(&self) -> char {
+        // `.next()` optimizes better than `.nth(0)`
+        self.chars.clone().next().unwrap_or(EOF_CHAR)
+    }
+
+    /// Checks if there is nothing more to consume.
+    pub(crate) fn is_eof(&self) -> bool {
+        self.chars.as_str().is_empty()
+    }
+
+    /// Returns the number of bytes consumed since the position was last reset.
+    pub(crate) fn pos_within_token(&self) -> u32 {
+        (self.len_remaining - self.chars.as_str().len()) as u32
+    }
+
+    /// Resets the number of bytes consumed to 0.
+    pub(crate) fn reset_pos_within_token(&mut self) {
+        self.len_remaining = self.chars.as_str().len();
+    }
+
+    /// Moves to the next character and returns it, or returns `None` when
+    /// there is nothing left to consume.
+    pub(crate) fn bump(&mut self) -> Option<char> {
+        self.chars.next()
+    }
+
+    /// Eats symbols while predicate returns true or until the end of file is reached.
+    pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
+        // It was tried making optimized version of this for eg. line comments, but
+        // LLVM can inline all of this and compile it down to fast iteration over bytes.
+        while predicate(self.first()) && !self.is_eof() {
+            self.bump();
+        }
+    }
+}
diff --git a/crates/squawk_lexer/src/lib.rs b/crates/squawk_lexer/src/lib.rs
new file mode 100644
index 00000000..9ca95112
--- /dev/null
+++ b/crates/squawk_lexer/src/lib.rs
@@ -0,0 +1,735 @@
+mod cursor;
+mod token;
+use cursor::{Cursor, EOF_CHAR};
+pub use token::{Base, LiteralKind, Token, TokenKind};
+
+// via: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L346
+// ident_start		[A-Za-z\200-\377_]  (\200-\377 are bytes >= 0x80, i.e. any non-ASCII char)
+const fn is_ident_start(c: char) -> bool {
+    matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '\u{80}'..='\u{10FFFF}')
+}
+
+// ident_cont		[A-Za-z\200-\377_0-9\$]  (\200-\377 are bytes >= 0x80, i.e. any non-ASCII char)
+const fn is_ident_cont(c: char) -> bool {
+    matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' | '$' | '\u{80}'..='\u{10FFFF}')
+}
+
+// Whitespace as recognized by this lexer: space, tab, newline, carriage
+// return, vertical tab and form feed.
+//
+// see:
+// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scansup.c#L107-L128
+// - https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L204-L229
+const fn is_whitespace(c: char) -> bool {
+    // The two characters without a short escape get a name instead of an
+    // inline comment.
+    const VERTICAL_TAB: char = '\u{000B}';
+    const FORM_FEED: char = '\u{000C}';
+
+    matches!(c, ' ' | '\t' | '\n' | '\r' | VERTICAL_TAB | FORM_FEED)
+}
+
+impl Cursor<'_> {
+    // see: https://github.com/rust-lang/rust/blob/ba1d7f4a083e6402679105115ded645512a7aea8/compiler/rustc_lexer/src/lib.rs#L339
+    pub(crate) fn advance_token(&mut self) -> Token {
+        let first_char = match self.bump() {
+            Some(c) => c,
+            None => return Token::new(TokenKind::Eof, 0),
+        };
+        let token_kind = match first_char {
+            // Slash, comment or block comment.
+            '/' => match self.first() {
+                '*' => self.block_comment(),
+                _ => TokenKind::Slash,
+            },
+            '-' => match self.first() {
+                '-' => self.line_comment(),
+                _ => TokenKind::Minus,
+            },
+
+            // Whitespace sequence.
+            c if is_whitespace(c) => self.whitespace(),
+
+            // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE
+            'u' | 'U' => match self.first() {
+                '&' => {
+                    self.bump();
+                    self.prefixed_string(
+                        |terminated| LiteralKind::UnicodeEscStr { terminated },
+                        true,
+                    )
+                }
+                _ => self.ident_or_unknown_prefix(),
+            },
+
+            // escaped strings
+            'e' | 'E' => {
+                self.prefixed_string(|terminated| LiteralKind::EscStr { terminated }, false)
+            }
+
+            // bit string
+            'b' | 'B' => {
+                self.prefixed_string(|terminated| LiteralKind::BitStr { terminated }, false)
+            }
+
+            // hexadecimal byte string
+            'x' | 'X' => {
+                self.prefixed_string(|terminated| LiteralKind::ByteStr { terminated }, false)
+            }
+
+            // Identifier (this should be checked after other variant that can
+            // start as identifier).
+            c if is_ident_start(c) => self.ident(),
+
+            // Numeric literal.
+            // see: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC
+            c @ '0'..='9' => {
+                let literal_kind = self.number(c);
+                TokenKind::Literal { kind: literal_kind }
+            }
+            '.' => match self.first() {
+                '0'..='9' => {
+                    let literal_kind = self.number('.');
+                    TokenKind::Literal { kind: literal_kind }
+                }
+                _ => TokenKind::Dot,
+            },
+            // One-symbol tokens.
+            ';' => TokenKind::Semi,
+            ',' => TokenKind::Comma,
+            '(' => TokenKind::OpenParen,
+            ')' => TokenKind::CloseParen,
+            '[' => TokenKind::OpenBracket,
+            ']' => TokenKind::CloseBracket,
+            '@' => TokenKind::At,
+            '#' => TokenKind::Pound,
+            '~' => TokenKind::Tilde,
+            '?' => TokenKind::Question,
+            ':' => TokenKind::Colon,
+            '$' => {
+                // Dollar quoted strings
+                if is_ident_start(self.first()) || self.first() == '$' {
+                    self.dollar_quoted_string()
+                } else {
+                    // Parameters
+                    while self.first().is_ascii_digit() {
+                        self.bump();
+                    }
+                    TokenKind::Param
+                }
+            }
+            '`' => TokenKind::Backtick,
+            '=' => TokenKind::Eq,
+            '!' => TokenKind::Bang,
+            '<' => TokenKind::Lt,
+            '>' => TokenKind::Gt,
+            '&' => TokenKind::And,
+            '|' => TokenKind::Or,
+            '+' => TokenKind::Plus,
+            '*' => TokenKind::Star,
+            '^' => TokenKind::Caret,
+            '%' => TokenKind::Percent,
+
+            // String literal
+            '\'' => {
+                let terminated = self.single_quoted_string();
+                let kind = LiteralKind::Str { terminated };
+                TokenKind::Literal { kind }
+            }
+
+            // Quoted identifiers
+            '"' => {
+                let terminated = self.double_quoted_string();
+                TokenKind::QuotedIdent { terminated }
+            }
+            _ => TokenKind::Unknown,
+        };
+        let res = Token::new(token_kind, self.pos_within_token());
+        self.reset_pos_within_token();
+        res
+    }
+    pub(crate) fn ident(&mut self) -> TokenKind {
+        self.eat_while(is_ident_cont);
+        TokenKind::Ident
+    }
+
+    pub(crate) fn whitespace(&mut self) -> TokenKind {
+        self.eat_while(is_whitespace);
+        TokenKind::Whitespace
+    }
+
+    fn ident_or_unknown_prefix(&mut self) -> TokenKind {
+        // Start is already eaten, eat the rest of identifier.
+        self.eat_while(is_ident_cont);
+        // Known prefixes must have been handled earlier. So if
+        // we see a prefix here, it is definitely an unknown prefix.
+        match self.first() {
+            '#' | '"' | '\'' => TokenKind::UnknownPrefix,
+            _ => TokenKind::Ident,
+        }
+    }
+
+    // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L227
+    // comment			("--"{non_newline}*)
+    pub(crate) fn line_comment(&mut self) -> TokenKind {
+        self.bump();
+
+        self.eat_while(|c| c != '\n');
+        TokenKind::LineComment
+    }
+
+    // see: https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L324-L344
+    pub(crate) fn block_comment(&mut self) -> TokenKind {
+        self.bump();
+
+        let mut depth = 1usize;
+        while let Some(c) = self.bump() {
+            match c {
+                '/' if self.first() == '*' => {
+                    self.bump();
+                    depth += 1;
+                }
+                '*' if self.first() == '/' => {
+                    self.bump();
+                    depth -= 1;
+                    if depth == 0 {
+                        // This block comment is closed, so for a construction like "/* */ */"
+                        // there will be a successfully parsed block comment "/* */"
+                        // and " */" will be processed separately.
+                        break;
+                    }
+                }
+                _ => (),
+            }
+        }
+
+        TokenKind::BlockComment {
+            terminated: depth == 0,
+        }
+    }
+
+    fn prefixed_string(
+        &mut self,
+        mk_kind: fn(bool) -> LiteralKind,
+        allows_double: bool,
+    ) -> TokenKind {
+        match self.first() {
+            '\'' => {
+                self.bump();
+                let terminated = self.single_quoted_string();
+                let kind = mk_kind(terminated);
+                TokenKind::Literal { kind }
+            }
+            '"' if allows_double => {
+                self.bump();
+                let terminated = self.double_quoted_string();
+                let kind = mk_kind(terminated);
+                TokenKind::Literal { kind }
+            }
+            _ => self.ident_or_unknown_prefix(),
+        }
+    }
+
+    fn number(&mut self, first_digit: char) -> LiteralKind {
+        let mut base = Base::Decimal;
+        if first_digit == '0' {
+            // Attempt to parse encoding base.
+            match self.first() {
+                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L403
+                'b' | 'B' => {
+                    base = Base::Binary;
+                    self.bump();
+                    if !self.eat_decimal_digits() {
+                        return LiteralKind::Int {
+                            base,
+                            empty_int: true,
+                        };
+                    }
+                }
+                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L402
+                'o' | 'O' => {
+                    base = Base::Octal;
+                    self.bump();
+                    if !self.eat_decimal_digits() {
+                        return LiteralKind::Int {
+                            base,
+                            empty_int: true,
+                        };
+                    }
+                }
+                // https://github.com/postgres/postgres/blob/db0c96cc18aec417101e37e59fcc53d4bf647915/src/backend/parser/scan.l#L401
+                'x' | 'X' => {
+                    base = Base::Hexadecimal;
+                    self.bump();
+                    if !self.eat_hexadecimal_digits() {
+                        return LiteralKind::Int {
+                            base,
+                            empty_int: true,
+                        };
+                    }
+                }
+                // Not a base prefix; consume additional digits.
+                '0'..='9' | '_' => {
+                    self.eat_decimal_digits();
+                }
+
+                // Also not a base prefix; nothing more to do here.
+                '.' | 'e' | 'E' => {}
+
+                // Just a 0.
+                _ => {
+                    return LiteralKind::Int {
+                        base,
+                        empty_int: false,
+                    }
+                }
+            }
+        } else {
+            // No base prefix, parse number in the usual way.
+            self.eat_decimal_digits();
+        };
+
+        match self.first() {
+            '.' => {
+                // might have stuff after the ., and if it does, it needs to start
+                // with a number
+                self.bump();
+                let mut empty_exponent = false;
+                if self.first().is_ascii_digit() {
+                    self.eat_decimal_digits();
+                    match self.first() {
+                        'e' | 'E' => {
+                            self.bump();
+                            empty_exponent = !self.eat_float_exponent();
+                        }
+                        _ => (),
+                    }
+                }
+                LiteralKind::Float {
+                    base,
+                    empty_exponent,
+                }
+            }
+            'e' | 'E' => {
+                self.bump();
+                let empty_exponent = !self.eat_float_exponent();
+                LiteralKind::Float {
+                    base,
+                    empty_exponent,
+                }
+            }
+            _ => LiteralKind::Int {
+                base,
+                empty_int: false,
+            },
+        }
+    }
+
+    fn single_quoted_string(&mut self) -> bool {
+        // Parse until either quotes are terminated or error is detected.
+        loop {
+            match self.first() {
+                // Quotes might be terminated.
+                '\'' => {
+                    self.bump();
+
+                    match self.first() {
+                        // encountered an escaped quote ''
+                        '\'' => {
+                            self.bump();
+                        }
+                        // encountered terminating quote
+                        _ => return true,
+                    }
+                }
+                // End of file, stop parsing.
+                EOF_CHAR if self.is_eof() => break,
+                // Skip the character.
+                _ => {
+                    self.bump();
+                }
+            }
+        }
+        // String was not terminated.
+        false
+    }
+
+    /// Eats the rest of a double-quoted token (the opening `"` is already
+    /// consumed) and returns true if it is terminated.
+    ///
+    /// A doubled quote (`""`) stands for a literal `"` and does not end the
+    /// token. Backslash has no special meaning inside double quotes.
+    fn double_quoted_string(&mut self) -> bool {
+        while let Some(c) = self.bump() {
+            if c == '"' {
+                if self.first() != '"' {
+                    return true;
+                }
+                // Doubled quote: skip its second half and keep scanning.
+                self.bump();
+            }
+        }
+        // End of file reached.
+        false
+    }
+
+    /// Lexes a dollar-quoted string; the leading `$` is already consumed.
+    ///
+    /// A dollar-quoted string has the form `$tag$body$tag$`, where `tag` may
+    /// be empty (`$$body$$`). The body ends at the first occurrence of the
+    /// exact delimiter `$tag$`; any other `$` is ordinary body content.
+    ///
+    /// For example, with the tag `foo`:
+    /// - `$foo$hello$foo$` is terminated,
+    /// - `$foo$hello$world$bar$` is not: neither `$world$` nor `$bar$` is the
+    ///   closing delimiter, so the token runs to the end of input.
+    ///
+    /// Always returns a `DollarQuotedString` literal; `terminated` is false
+    /// when the end of input is reached before the closing delimiter, in
+    /// which case the token extends to the end of input.
+    ///
+    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
+    fn dollar_quoted_string(&mut self) -> TokenKind {
+        // Get the tag of the opening delimiter, i.e., 'foo' in
+        // $foo$hello$foo$. Only the tag and the `$` that ends the opening
+        // delimiter are consumed here: the character after that `$` is
+        // already part of the body and must stay available to the scan below.
+        //
+        // NOTE: the tag is everything up to the next `$`; it is not validated
+        // against the identifier rules Postgres applies to tags.
+        let mut tag = vec![];
+        while let Some(c) = self.bump() {
+            if c == '$' {
+                break;
+            }
+            tag.push(c);
+        }
+
+        loop {
+            // A closing delimiter can only begin at a `$`.
+            self.eat_while(|c| c != '$');
+            if self.is_eof() {
+                return TokenKind::Literal {
+                    kind: LiteralKind::DollarQuotedString { terminated: false },
+                };
+            }
+            // eat the `$` that may start the closing delimiter
+            self.bump();
+
+            // Consume tag characters for as long as they match. The tag never
+            // contains `$`, so nothing consumed here could have started a
+            // delimiter. On a mismatch the offending character is left in
+            // place: if it is a `$`, the next iteration treats it as a new
+            // candidate delimiter.
+            let mut tag_matched = true;
+            for &tag_char in &tag {
+                if self.first() != tag_char {
+                    tag_matched = false;
+                    break;
+                }
+                self.bump();
+            }
+
+            // closing '$'
+            if tag_matched && self.first() == '$' {
+                self.bump();
+                return TokenKind::Literal {
+                    kind: LiteralKind::DollarQuotedString { terminated: true },
+                };
+            }
+        }
+    }
+
+    fn eat_decimal_digits(&mut self) -> bool {
+        let mut has_digits = false;
+        loop {
+            match self.first() {
+                '_' => {
+                    self.bump();
+                }
+                '0'..='9' => {
+                    has_digits = true;
+                    self.bump();
+                }
+                _ => break,
+            }
+        }
+        has_digits
+    }
+
+    fn eat_hexadecimal_digits(&mut self) -> bool {
+        let mut has_digits = false;
+        loop {
+            match self.first() {
+                '_' => {
+                    self.bump();
+                }
+                '0'..='9' | 'a'..='f' | 'A'..='F' => {
+                    has_digits = true;
+                    self.bump();
+                }
+                _ => break,
+            }
+        }
+        has_digits
+    }
+
+    /// Eats the float exponent. Returns true if at least one digit was met,
+    /// and returns false otherwise.
+    fn eat_float_exponent(&mut self) -> bool {
+        if self.first() == '-' || self.first() == '+' {
+            self.bump();
+        }
+        self.eat_decimal_digits()
+    }
+}
+
+/// Creates an iterator that produces tokens from the input string.
+///
+/// Tokens are produced lazily, in source order, each time the iterator is
+/// advanced. Iteration stops at the first `Eof` token, which is not itself
+/// yielded.
+pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
+    let mut lexer = Cursor::new(input);
+    std::iter::from_fn(move || match lexer.advance_token() {
+        next if next.kind != TokenKind::Eof => Some(next),
+        _ => None,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fmt;
+
+    use super::*;
+    use insta::assert_debug_snapshot;
+
+    struct TokenDebug<'a> {
+        content: &'a str,
+        token: Token,
+    }
+    impl fmt::Debug for TokenDebug<'_> {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            write!(f, "{:?} @ {:?}", self.content, self.token.kind)
+        }
+    }
+
+    impl<'a> TokenDebug<'a> {
+        fn new(token: Token, input: &'a str, start: u32) -> TokenDebug<'a> {
+            TokenDebug {
+                token,
+                content: &input[start as usize..(start + token.len) as usize],
+            }
+        }
+    }
+
+    fn lex(input: &str) -> Vec<TokenDebug> {
+        let mut tokens = vec![];
+        let mut start = 0;
+
+        for token in tokenize(input) {
+            let length = token.len;
+            tokens.push(TokenDebug::new(token, input, start));
+            start += length;
+        }
+        tokens
+    }
+    #[test]
+    fn lex_statement() {
+        let result = lex("select 1;");
+        assert_debug_snapshot!(result);
+    }
+
+    #[test]
+    fn block_comment() {
+        let result = lex(r#"
+/*
+ * foo
+ * bar
+*/"#);
+        assert_debug_snapshot!(result);
+    }
+
+    #[test]
+    fn block_comment_unterminated() {
+        let result = lex(r#"
+/*
+ * foo
+ * bar
+ /*
+*/"#);
+        assert_debug_snapshot!(result);
+    }
+
+    #[test]
+    fn line_comment() {
+        let result = lex(r#"
+-- foooooooooooo bar buzz
+"#);
+        assert_debug_snapshot!(result);
+    }
+
+    #[test]
+    fn line_comment_whitespace() {
+        assert_debug_snapshot!(lex(r#"
+select 'Hello' -- This is a comment
+' World';"#))
+    }
+
+    #[test]
+    fn dollar_quoting() {
+        assert_debug_snapshot!(lex(r#"
+$$Dianne's horse$$
+$SomeTag$Dianne's horse$SomeTag$
+
+-- with dollar inside and matching tags
+$foo$hello$world$bar$
+"#))
+    }
+
+    #[test]
+    fn dollar_quote_mismatch_tags_simple() {
+        assert_debug_snapshot!(lex(r#"
+-- dollar quoting with mismatched tags
+$foo$hello world$bar$
+"#));
+    }
+
+    #[test]
+    fn dollar_quote_mismatch_tags_complex() {
+        assert_debug_snapshot!(lex(r#"
+-- with dollar inside but mismatched tags
+$foo$hello$world$bar$
+"#));
+    }
+
+    #[test]
+    fn numeric() {
+        assert_debug_snapshot!(lex(r#"
+42
+3.5
+4.
+.001
+.123e10
+5e2
+1.925e-3
+1e-10
+1e+10
+1e10
+"#))
+    }
+
+    #[test]
+    fn numeric_non_decimal() {
+        assert_debug_snapshot!(lex(r#"
+0b100101
+0B10011001
+0o273
+0O755
+0x42f
+0XFFFF
+"#))
+    }
+
+    #[test]
+    fn numeric_with_seperators() {
+        assert_debug_snapshot!(lex(r#"
+1_500_000_000
+0b10001000_00000000
+0o_1_755
+0xFFFF_FFFF
+1.618_034
+"#))
+    }
+
+    #[test]
+    fn select_with_period() {
+        assert_debug_snapshot!(lex(r#"
+select public.users;
+"#))
+    }
+
+    #[test]
+    fn bitstring() {
+        assert_debug_snapshot!(lex(r#"
+B'1001'
+b'1001'
+X'1FF'
+x'1FF'
+"#))
+    }
+
+    #[test]
+    fn string() {
+        assert_debug_snapshot!(lex(r#"
+'Dianne''s horse'
+
+select 'foo ''
+bar';
+
+select 'foooo'   
+   'bar';
+
+
+'foo \\ \n \tbar'
+
+'forgot to close the string
+"#))
+    }
+
+    #[test]
+    fn params() {
+        assert_debug_snapshot!(lex(r#"
+select $1 + $2;
+
+select $1123123123123;
+
+select $;
+"#))
+    }
+
+    #[test]
+    fn string_with_escapes() {
+        // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-ESCAPE
+
+        assert_debug_snapshot!(lex(r#"
+E'foo'
+
+e'bar'
+
+e'\b\f\n\r\t'
+
+e'\0\11\777'
+
+e'\x0\x11\xFF'
+
+e'\uAAAA \UFFFFFFFF'
+
+"#))
+    }
+
+    #[test]
+    fn string_unicode_escape() {
+        // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE
+
+        assert_debug_snapshot!(lex(r#"
+U&"d\0061t\+000061"
+
+U&"\0441\043B\043E\043D"
+
+u&'\0441\043B'
+
+U&"d!0061t!+000061" UESCAPE '!'
+"#))
+    }
+
+    #[test]
+    fn quoted_ident() {
+        assert_debug_snapshot!(lex(r#"
+"hello &1 -world";
+
+
+"hello-world
+"#))
+    }
+}
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__bitstring.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__bitstring.snap
new file mode 100644
index 00000000..9399cff9
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__bitstring.snap
@@ -0,0 +1,15 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\nB'1001'\nb'1001'\nX'1FF'\nx'1FF'\n\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "B'1001'" @ Literal { kind: BitStr { terminated: true } },
+    "\n" @ Whitespace,
+    "b'1001'" @ Literal { kind: BitStr { terminated: true } },
+    "\n" @ Whitespace,
+    "X'1FF'" @ Literal { kind: ByteStr { terminated: true } },
+    "\n" @ Whitespace,
+    "x'1FF'" @ Literal { kind: ByteStr { terminated: true } },
+    "\n" @ Whitespace,
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment.snap
new file mode 100644
index 00000000..f7bfe460
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment.snap
@@ -0,0 +1,8 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: result
+---
+[
+    "\n" @ Whitespace,
+    "/*\n * foo\n * bar\n*/" @ BlockComment { terminated: true },
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment_unterminated.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment_unterminated.snap
new file mode 100644
index 00000000..2acb3e33
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__block_comment_unterminated.snap
@@ -0,0 +1,8 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: result
+---
+[
+    "\n" @ Whitespace,
+    "/*\n * foo\n * bar\n /*\n*/" @ BlockComment { terminated: false },
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_complex.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_complex.snap
new file mode 100644
index 00000000..914f4770
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_complex.snap
@@ -0,0 +1,10 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\n-- with dollar inside but mismatched tags\n$foo$hello$world$bar$\n\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "-- with dollar inside but mismatched tags" @ LineComment,
+    "\n" @ Whitespace,
+    "$foo$hello$world$bar$\n" @ Literal { kind: DollarQuotedString { terminated: false } },
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_simple.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_simple.snap
new file mode 100644
index 00000000..1035dd00
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quote_mismatch_tags_simple.snap
@@ -0,0 +1,10 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\n-- dollar quoting with mismatched tags\n$foo$hello world$bar$\n\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "-- dollar quoting with mismatched tags" @ LineComment,
+    "\n" @ Whitespace,
+    "$foo$hello world$bar$\n" @ Literal { kind: DollarQuotedString { terminated: false } },
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quoting.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quoting.snap
new file mode 100644
index 00000000..b1d86251
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__dollar_quoting.snap
@@ -0,0 +1,14 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\n$$Dianne's horse$$\n$SomeTag$Dianne's horse$SomeTag$\n\n-- with dollar inside and matching tags\n$foo$hello$world$bar$\n\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "$$Dianne's horse$$" @ Literal { kind: DollarQuotedString { terminated: true } },
+    "\n" @ Whitespace,
+    "$SomeTag$Dianne's horse$SomeTag$" @ Literal { kind: DollarQuotedString { terminated: true } },
+    "\n\n" @ Whitespace,
+    "-- with dollar inside and matching tags" @ LineComment,
+    "\n" @ Whitespace,
+    "$foo$hello$world$bar$\n" @ Literal { kind: DollarQuotedString { terminated: false } },
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__lex_statement.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__lex_statement.snap
new file mode 100644
index 00000000..a76c42ba
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__lex_statement.snap
@@ -0,0 +1,10 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: result
+---
+[
+    "select" @ Ident,
+    " " @ Whitespace,
+    "1" @ Literal { kind: Int { base: Decimal, empty_int: false } },
+    ";" @ Semi,
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment.snap
new file mode 100644
index 00000000..e58ef5ff
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment.snap
@@ -0,0 +1,9 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: result
+---
+[
+    "\n" @ Whitespace,
+    "-- foooooooooooo bar buzz" @ LineComment,
+    "\n" @ Whitespace,
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment_whitespace.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment_whitespace.snap
new file mode 100644
index 00000000..fef03086
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__line_comment_whitespace.snap
@@ -0,0 +1,15 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\nselect 'Hello' -- This is a comment\n' World';\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "select" @ Ident,
+    " " @ Whitespace,
+    "'Hello'" @ Literal { kind: Str { terminated: true } },
+    " " @ Whitespace,
+    "-- This is a comment" @ LineComment,
+    "\n" @ Whitespace,
+    "' World'" @ Literal { kind: Str { terminated: true } },
+    ";" @ Semi,
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__numeric.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric.snap
new file mode 100644
index 00000000..c831bd25
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric.snap
@@ -0,0 +1,27 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\n42\n3.5\n4.\n.001\n.123e10\n5e2\n1.925e-3\n1e-10\n1e+10\n1e10\n\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "42" @ Literal { kind: Int { base: Decimal, empty_int: false } },
+    "\n" @ Whitespace,
+    "3.5" @ Literal { kind: Float { base: Decimal, empty_exponent: false } },
+    "\n" @ Whitespace,
+    "4." @ Literal { kind: Float { base: Decimal, empty_exponent: false } },
+    "\n" @ Whitespace,
+    ".001" @ Literal { kind: Int { base: Decimal, empty_int: false } },
+    "\n" @ Whitespace,
+    ".123e10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } },
+    "\n" @ Whitespace,
+    "5e2" @ Literal { kind: Float { base: Decimal, empty_exponent: false } },
+    "\n" @ Whitespace,
+    "1.925e-3" @ Literal { kind: Float { base: Decimal, empty_exponent: false } },
+    "\n" @ Whitespace,
+    "1e-10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } },
+    "\n" @ Whitespace,
+    "1e+10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } },
+    "\n" @ Whitespace,
+    "1e10" @ Literal { kind: Float { base: Decimal, empty_exponent: false } },
+    "\n" @ Whitespace,
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_non_decimal.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_non_decimal.snap
new file mode 100644
index 00000000..5050265f
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_non_decimal.snap
@@ -0,0 +1,19 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\n0b100101\n0B10011001\n0o273\n0O755\n0x42f\n0XFFFF\n\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "0b100101" @ Literal { kind: Int { base: Binary, empty_int: false } },
+    "\n" @ Whitespace,
+    "0B10011001" @ Literal { kind: Int { base: Binary, empty_int: false } },
+    "\n" @ Whitespace,
+    "0o273" @ Literal { kind: Int { base: Octal, empty_int: false } },
+    "\n" @ Whitespace,
+    "0O755" @ Literal { kind: Int { base: Octal, empty_int: false } },
+    "\n" @ Whitespace,
+    "0x42f" @ Literal { kind: Int { base: Hexadecimal, empty_int: false } },
+    "\n" @ Whitespace,
+    "0XFFFF" @ Literal { kind: Int { base: Hexadecimal, empty_int: false } },
+    "\n" @ Whitespace,
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_with_seperators.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_with_seperators.snap
new file mode 100644
index 00000000..46814c5f
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__numeric_with_seperators.snap
@@ -0,0 +1,17 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\n1_500_000_000\n0b10001000_00000000\n0o_1_755\n0xFFFF_FFFF\n1.618_034\n\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "1_500_000_000" @ Literal { kind: Int { base: Decimal, empty_int: false } },
+    "\n" @ Whitespace,
+    "0b10001000_00000000" @ Literal { kind: Int { base: Binary, empty_int: false } },
+    "\n" @ Whitespace,
+    "0o_1_755" @ Literal { kind: Int { base: Octal, empty_int: false } },
+    "\n" @ Whitespace,
+    "0xFFFF_FFFF" @ Literal { kind: Int { base: Hexadecimal, empty_int: false } },
+    "\n" @ Whitespace,
+    "1.618_034" @ Literal { kind: Float { base: Decimal, empty_exponent: false } },
+    "\n" @ Whitespace,
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__params.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__params.snap
new file mode 100644
index 00000000..1879b452
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__params.snap
@@ -0,0 +1,26 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\nselect $1 + $2;\n\nselect $1123123123123;\n\nselect $;\n\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "select" @ Ident,
+    " " @ Whitespace,
+    "$1" @ Param,
+    " " @ Whitespace,
+    "+" @ Plus,
+    " " @ Whitespace,
+    "$2" @ Param,
+    ";" @ Semi,
+    "\n\n" @ Whitespace,
+    "select" @ Ident,
+    " " @ Whitespace,
+    "$1123123123123" @ Param,
+    ";" @ Semi,
+    "\n\n" @ Whitespace,
+    "select" @ Ident,
+    " " @ Whitespace,
+    "$" @ Param,
+    ";" @ Semi,
+    "\n" @ Whitespace,
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__quoted_ident.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__quoted_ident.snap
new file mode 100644
index 00000000..70f71342
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__quoted_ident.snap
@@ -0,0 +1,11 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\n\"hello &1 -world\";\n\n\n\"hello-world\n\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "\"hello &1 -world\"" @ QuotedIdent { terminated: true },
+    ";" @ Semi,
+    "\n\n\n" @ Whitespace,
+    "\"hello-world\n" @ QuotedIdent { terminated: false },
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__select_with_period.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__select_with_period.snap
new file mode 100644
index 00000000..00ef920f
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__select_with_period.snap
@@ -0,0 +1,14 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\nselect public.users;\n\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "select" @ Ident,
+    " " @ Whitespace,
+    "public" @ Ident,
+    "." @ Dot,
+    "users" @ Ident,
+    ";" @ Semi,
+    "\n" @ Whitespace,
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__string.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__string.snap
new file mode 100644
index 00000000..1022c823
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__string.snap
@@ -0,0 +1,24 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\n'Dianne''s horse'\n\nselect 'foo ''\nbar';\n\nselect 'foooo'   \n   'bar';\n\n\n'foo \\\\ \\n \\tbar'\n\n'forgot to close the string\n\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "'Dianne''s horse'" @ Literal { kind: Str { terminated: true } },
+    "\n\n" @ Whitespace,
+    "select" @ Ident,
+    " " @ Whitespace,
+    "'foo ''\nbar'" @ Literal { kind: Str { terminated: true } },
+    ";" @ Semi,
+    "\n\n" @ Whitespace,
+    "select" @ Ident,
+    " " @ Whitespace,
+    "'foooo'" @ Literal { kind: Str { terminated: true } },
+    "   \n   " @ Whitespace,
+    "'bar'" @ Literal { kind: Str { terminated: true } },
+    ";" @ Semi,
+    "\n\n\n" @ Whitespace,
+    "'foo \\\\ \\n \\tbar'" @ Literal { kind: Str { terminated: true } },
+    "\n\n" @ Whitespace,
+    "'forgot to close the string\n" @ Literal { kind: Str { terminated: false } },
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__string_unicode_escape.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__string_unicode_escape.snap
new file mode 100644
index 00000000..b257b050
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__string_unicode_escape.snap
@@ -0,0 +1,19 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\nU&\"d\\0061t\\+000061\"\n\nU&\"\\0441\\043B\\043E\\043D\"\n\nu&'\\0441\\043B'\n\nU&\"d!0061t!+000061\" UESCAPE '!'\n\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "U&\"d\\0061t\\+000061\"" @ Literal { kind: UnicodeEscStr { terminated: true } },
+    "\n\n" @ Whitespace,
+    "U&\"\\0441\\043B\\043E\\043D\"" @ Literal { kind: UnicodeEscStr { terminated: true } },
+    "\n\n" @ Whitespace,
+    "u&'\\0441\\043B'" @ Literal { kind: UnicodeEscStr { terminated: true } },
+    "\n\n" @ Whitespace,
+    "U&\"d!0061t!+000061\"" @ Literal { kind: UnicodeEscStr { terminated: true } },
+    " " @ Whitespace,
+    "UESCAPE" @ Ident,
+    " " @ Whitespace,
+    "'!'" @ Literal { kind: Str { terminated: true } },
+    "\n" @ Whitespace,
+]
diff --git a/crates/squawk_lexer/src/snapshots/lexer__tests__string_with_escapes.snap b/crates/squawk_lexer/src/snapshots/lexer__tests__string_with_escapes.snap
new file mode 100644
index 00000000..67947733
--- /dev/null
+++ b/crates/squawk_lexer/src/snapshots/lexer__tests__string_with_escapes.snap
@@ -0,0 +1,19 @@
+---
+source: crates/squawk_lexer/src/lib.rs
+expression: "lex(r#\"\nE'foo'\n\ne'bar'\n\ne'\\b\\f\\n\\r\\t'\n\ne'\\0\\11\\777'\n\ne'\\x0\\x11\\xFF'\n\ne'\\uAAAA \\UFFFFFFFF'\n\n\"#)"
+---
+[
+    "\n" @ Whitespace,
+    "E'foo'" @ Literal { kind: EscStr { terminated: true } },
+    "\n\n" @ Whitespace,
+    "e'bar'" @ Literal { kind: EscStr { terminated: true } },
+    "\n\n" @ Whitespace,
+    "e'\\b\\f\\n\\r\\t'" @ Literal { kind: EscStr { terminated: true } },
+    "\n\n" @ Whitespace,
+    "e'\\0\\11\\777'" @ Literal { kind: EscStr { terminated: true } },
+    "\n\n" @ Whitespace,
+    "e'\\x0\\x11\\xFF'" @ Literal { kind: EscStr { terminated: true } },
+    "\n\n" @ Whitespace,
+    "e'\\uAAAA \\UFFFFFFFF'" @ Literal { kind: EscStr { terminated: true } },
+    "\n\n" @ Whitespace,
+]
diff --git a/crates/squawk_lexer/src/token.rs b/crates/squawk_lexer/src/token.rs
new file mode 100644
index 00000000..9853f8cc
--- /dev/null
+++ b/crates/squawk_lexer/src/token.rs
@@ -0,0 +1,155 @@
+// based on: https://github.com/rust-lang/rust/blob/d1b7355d3d7b4ead564dbecb1d240fcc74fff21b/compiler/rustc_lexer/src/lib.rs#L58
+#[derive(Debug, PartialEq, Clone, Copy)]
+pub enum TokenKind {
+    /// Used when there's an error of some sort while lexing.
+    Unknown,
+    /// Examples: `42`, `1.0e-40`, `B'1001'`. Note that `_` is an invalid
+    /// suffix, but may be present here on string and float literals. Users of
+    /// this type will need to check for and reject that case.
+    ///
+    /// See [`LiteralKind`] for more details.
+    Literal { kind: LiteralKind },
+    /// Space, tab, newline, carriage return, vertical tab, form feed
+    Whitespace,
+    /// Identifier
+    ///
+    /// case-insensitive
+    Ident,
+    /// `;`
+    Semi,
+    /// End of file
+    Eof,
+    /// `/`
+    Slash,
+    /// `-- foo`
+    LineComment,
+    /// ```
+    /// /*
+    /// foo
+    /// */
+    /// ```
+    BlockComment { terminated: bool },
+    /// `-`
+    Minus,
+    /// `:`
+    Colon,
+    /// `.`
+    Dot,
+    /// `=`
+    Eq,
+    /// `>`
+    Gt,
+    /// `&`
+    And,
+    /// `<`
+    Lt,
+    /// `!`
+    Bang,
+    /// `+`
+    Plus,
+    /// `~`
+    Tilde,
+    /// `#`
+    Pound,
+    /// `?`
+    Question,
+    /// `|`
+    Or,
+    /// `%`
+    Percent,
+    /// `^`
+    Caret,
+    /// `*`
+    Star,
+    /// `` ` ``
+    Backtick,
+    /// `@`
+    At,
+    /// `]`
+    CloseBracket,
+    /// `[`
+    OpenBracket,
+    /// `)`
+    CloseParen,
+    /// `(`
+    OpenParen,
+    /// `,`
+    Comma,
+    /// Error case that we need to report later on.
+    UnknownPrefix,
+    /// Positional Parameter, e.g., `$1`
+    ///
+    /// see: https://www.postgresql.org/docs/16/sql-expressions.html#SQL-EXPRESSIONS-PARAMETERS-POSITIONAL
+    Param,
+    /// Quoted Identifier, e.g., `"update"` in `update "my_table" set "a" = 5;`
+    ///
+    /// These are case-sensitive, unlike [`TokenKind::Ident`]
+    ///
+    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
+    QuotedIdent { terminated: bool },
+}
+
+/// Parsed token.
+/// It doesn't contain information about data that has been parsed,
+/// only the type of the token and its size.
+#[derive(Debug, Clone, Copy)]
+pub struct Token {
+    pub kind: TokenKind,
+    pub len: u32,
+}
+
+impl Token {
+    pub(crate) fn new(kind: TokenKind, len: u32) -> Token {
+        Token { kind, len }
+    }
+}
+
+/// Base of numeric literal encoding according to its prefix.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum Base {
+    /// Literal starts with "0b".
+    Binary = 2,
+    /// Literal starts with "0o".
+    Octal = 8,
+    /// Literal doesn't contain a prefix.
+    Decimal = 10,
+    /// Literal starts with "0x".
+    Hexadecimal = 16,
+}
+
+/// Enum representing the literal types supported by the lexer.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LiteralKind {
+    /// Integer Numeric, e.g., `42`
+    ///
+    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC
+    Int { base: Base, empty_int: bool },
+    /// Float Numeric, e.g., `1.925e-3`
+    ///
+    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC
+    Float { base: Base, empty_exponent: bool },
+    /// String, e.g., `'foo'`
+    ///
+    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS
+    Str { terminated: bool },
+    /// Hexidecimal Bit String, e.g., `X'1FF'`
+    ///
+    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-BIT-STRINGS
+    ByteStr { terminated: bool },
+    /// Bit String, e.g., `B'1001'`
+    ///
+    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-BIT-STRINGS
+    BitStr { terminated: bool },
+    /// Dollar Quoted String, e.g., `$$Dianne's horse$$`
+    ///
+    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
+    DollarQuotedString { terminated: bool },
+    /// Unicode Escape String, e.g., `U&'d\0061t\+000061'`
+    ///
+    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE
+    UnicodeEscStr { terminated: bool },
+    /// Escape String, e.g, `E'foo'`
+    ///
+    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html
+    EscStr { terminated: bool },
+}

From 1e1efb7827f7895e51e6ffcf2f90f4aa4c3b726d Mon Sep 17 00:00:00 2001
From: Steve Dignam <steve@dignam.xyz>
Date: Sat, 3 May 2025 16:45:20 -0400
Subject: [PATCH 2/6] fix

---
 Cargo.toml                     | 4 ++++
 crates/linter/Cargo.toml       | 6 +++---
 crates/squawk_lexer/Cargo.toml | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index bfa13eb7..2e90fad1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,9 @@
 [workspace]
 members = ["crates/*"]
+authors = ["Squawk Team & Contributors"]
+edition = "2021"
+license = "GPL-3.0"
+rust-version = "1.81.0"
 
 [workspace.dependencies]
 # third party
diff --git a/crates/linter/Cargo.toml b/crates/linter/Cargo.toml
index 0d6eb7c6..a0cbde71 100644
--- a/crates/linter/Cargo.toml
+++ b/crates/linter/Cargo.toml
@@ -1,9 +1,9 @@
 [package]
 name = "squawk-linter"
 version = "0.0.0"
-authors = ["Steve Dignam <steve@dignam.xyz>"]
-edition = "2018"
-license = "GPL-3.0"
+authors.workspace = true
+edition.workspace = true
+license.workspace = true
 description = "Postgres SQL linter used in squawk"
 repository = "https://github.com/sbdchd/squawk"
 readme = "README.md"
diff --git a/crates/squawk_lexer/Cargo.toml b/crates/squawk_lexer/Cargo.toml
index 11c75156..699c9077 100644
--- a/crates/squawk_lexer/Cargo.toml
+++ b/crates/squawk_lexer/Cargo.toml
@@ -5,7 +5,7 @@ description = "TBD"
 
 authors.workspace = true
 edition.workspace = true
-license.workspace = true
+license = "MIT"
 rust-version.workspace = true
 
 [lib]

From c4233a8acd1aaa4bb556bb601fe9dadb54f7ce83 Mon Sep 17 00:00:00 2001
From: Steve Dignam <steve@dignam.xyz>
Date: Sat, 3 May 2025 16:50:06 -0400
Subject: [PATCH 3/6] fix

---
 Cargo.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Cargo.toml b/Cargo.toml
index 2e90fad1..e44a1c43 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,7 @@
 [workspace]
 members = ["crates/*"]
+
+[workspace.package]
 authors = ["Squawk Team & Contributors"]
 edition = "2021"
 license = "GPL-3.0"

From a3c47078462153c56d37614858cfa1ecfb5ae3b9 Mon Sep 17 00:00:00 2001
From: Steve Dignam <steve@dignam.xyz>
Date: Sat, 3 May 2025 16:53:01 -0400
Subject: [PATCH 4/6] fix

---
 Cargo.lock |  7 +++++++
 Cargo.toml | 13 +++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b2c00ea6..69e41799 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -749,6 +749,13 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
 
+[[package]]
+name = "lexer"
+version = "0.0.0"
+dependencies = [
+ "insta",
+]
+
 [[package]]
 name = "libc"
 version = "0.2.167"
diff --git a/Cargo.toml b/Cargo.toml
index e44a1c43..31700045 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,11 +1,12 @@
 [workspace]
 members = ["crates/*"]
+resolver = "2"
 
 [workspace.package]
-authors = ["Squawk Team & Contributors"]
 edition = "2021"
-license = "GPL-3.0"
 rust-version = "1.81.0"
+authors = ["Squawk Team & Contributors"]
+license = "GPL-3.0"
 
 [workspace.dependencies]
 # third party
@@ -33,5 +34,13 @@ squawk-parser = { version = "0.0.0", path = "./crates/parser" }
 squawk-linter = { version = "0.0.0", path = "./crates/linter" }
 squawk-github = { version = "0.0.0", path = "./crates/github" }
 
+[workspace.lints.clippy]
+collapsible_else_if = "allow"
+collapsible_if = "allow"
+needless_return = "allow"
+
+[profile.dev]
+debug = 0
+
 [profile.dev.package]
 insta.opt-level = 3

From 771c36c03fb706a7a8e2e97b6680c6463ab22ec9 Mon Sep 17 00:00:00 2001
From: Steve Dignam <steve@dignam.xyz>
Date: Sat, 3 May 2025 17:00:10 -0400
Subject: [PATCH 5/6] fix

---
 Cargo.toml                        |  4 ++++
 crates/squawk_lexer/src/cursor.rs |  4 ++--
 crates/squawk_lexer/src/lib.rs    |  7 +++----
 crates/squawk_lexer/src/token.rs  | 20 ++++++++++----------
 4 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 31700045..49cd1eb8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -38,6 +38,10 @@ squawk-github = { version = "0.0.0", path = "./crates/github" }
 collapsible_else_if = "allow"
 collapsible_if = "allow"
 needless_return = "allow"
+if_not_else = "allow"
+needless_raw_string_hashes = "allow"
+cast_possible_truncation = "allow"
+semicolon_if_nothing_returned = "allow"
 
 [profile.dev]
 debug = 0
diff --git a/crates/squawk_lexer/src/cursor.rs b/crates/squawk_lexer/src/cursor.rs
index dad5d9e5..c70c5feb 100644
--- a/crates/squawk_lexer/src/cursor.rs
+++ b/crates/squawk_lexer/src/cursor.rs
@@ -5,8 +5,8 @@ use std::str::Chars;
 /// Next characters can be peeked via `first` method,
 /// and position can be shifted forward via `bump` method.
 /// based on:
-/// - https://github.com/rust-lang/rust/blob/d1b7355d3d7b4ead564dbecb1d240fcc74fff21b/compiler/rustc_lexer/src/cursor.rs
-/// - https://github.com/astral-sh/ruff/blob/d1079680bb29f6b797b5df15327195300f635f3c/crates/ruff_python_parser/src/lexer/cursor.rs
+/// - <https://github.com/rust-lang/rust/blob/d1b7355d3d7b4ead564dbecb1d240fcc74fff21b/compiler/rustc_lexer/src/cursor.rs>
+/// - <https://github.com/astral-sh/ruff/blob/d1079680bb29f6b797b5df15327195300f635f3c/crates/ruff_python_parser/src/lexer/cursor.rs>
 ///
 pub(crate) struct Cursor<'a> {
     /// Iterator over chars. Slightly faster than a &str.
diff --git a/crates/squawk_lexer/src/lib.rs b/crates/squawk_lexer/src/lib.rs
index 9ca95112..dff91c6c 100644
--- a/crates/squawk_lexer/src/lib.rs
+++ b/crates/squawk_lexer/src/lib.rs
@@ -32,9 +32,8 @@ const fn is_whitespace(c: char) -> bool {
 impl Cursor<'_> {
     // see: https://github.com/rust-lang/rust/blob/ba1d7f4a083e6402679105115ded645512a7aea8/compiler/rustc_lexer/src/lib.rs#L339
     pub(crate) fn advance_token(&mut self) -> Token {
-        let first_char = match self.bump() {
-            Some(c) => c,
-            None => return Token::new(TokenKind::Eof, 0),
+        let Some(first_char) = self.bump() else {
+            return Token::new(TokenKind::Eof, 0);
         };
         let token_kind = match first_char {
             // Slash, comment or block comment.
@@ -418,7 +417,7 @@ impl Cursor<'_> {
 
                 // might be the start of our start/end sequence
                 let mut match_count = 0;
-                for start_char in start.iter() {
+                for start_char in &start {
                     if self.first() == *start_char {
                         self.bump();
                         match_count += 1;
diff --git a/crates/squawk_lexer/src/token.rs b/crates/squawk_lexer/src/token.rs
index 9853f8cc..5827c672 100644
--- a/crates/squawk_lexer/src/token.rs
+++ b/crates/squawk_lexer/src/token.rs
@@ -79,13 +79,13 @@ pub enum TokenKind {
     UnknownPrefix,
     /// Positional Parameter, e.g., `$1`
     ///
-    /// see: https://www.postgresql.org/docs/16/sql-expressions.html#SQL-EXPRESSIONS-PARAMETERS-POSITIONAL
+    /// see: <https://www.postgresql.org/docs/16/sql-expressions.html#SQL-EXPRESSIONS-PARAMETERS-POSITIONAL>
     Param,
     /// Quoted Identifier, e.g., `"update"` in `update "my_table" set "a" = 5;`
     ///
     /// These are case-sensitive, unlike [`TokenKind::Ident`]
     ///
-    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
+    /// see: <https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS>
     QuotedIdent { terminated: bool },
 }
 
@@ -122,34 +122,34 @@ pub enum Base {
 pub enum LiteralKind {
     /// Integer Numeric, e.g., `42`
     ///
-    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC
+    /// see: <https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC>
     Int { base: Base, empty_int: bool },
     /// Float Numeric, e.g., `1.925e-3`
     ///
-    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC
+    /// see: <https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-CONSTANTS-NUMERIC>
     Float { base: Base, empty_exponent: bool },
     /// String, e.g., `'foo'`
     ///
-    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS
+    /// see: <https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS>
     Str { terminated: bool },
     /// Hexidecimal Bit String, e.g., `X'1FF'`
     ///
-    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-BIT-STRINGS
+    /// see: <https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-BIT-STRINGS>
     ByteStr { terminated: bool },
     /// Bit String, e.g., `B'1001'`
     ///
-    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-BIT-STRINGS
+    /// see: <https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-BIT-STRINGS>
     BitStr { terminated: bool },
     /// Dollar Quoted String, e.g., `$$Dianne's horse$$`
     ///
-    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
+    /// see: <https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING>
     DollarQuotedString { terminated: bool },
     /// Unicode Escape String, e.g., `U&'d\0061t\+000061'`
     ///
-    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE
+    /// see: <https://www.postgresql.org/docs/16/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE>
     UnicodeEscStr { terminated: bool },
     /// Escape String, e.g, `E'foo'`
     ///
-    /// see: https://www.postgresql.org/docs/16/sql-syntax-lexical.html
+    /// see: <https://www.postgresql.org/docs/16/sql-syntax-lexical.html>
     EscStr { terminated: bool },
 }

From 1cbe6b3e70fae9287e082947498a068f57478bc6 Mon Sep 17 00:00:00 2001
From: Steve Dignam <steve@dignam.xyz>
Date: Sat, 3 May 2025 17:18:52 -0400
Subject: [PATCH 6/6] fix

---
 Cargo.toml | 6 ++----
 s/lint     | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 49cd1eb8..5b15e8d3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -38,10 +38,8 @@ squawk-github = { version = "0.0.0", path = "./crates/github" }
 collapsible_else_if = "allow"
 collapsible_if = "allow"
 needless_return = "allow"
-if_not_else = "allow"
-needless_raw_string_hashes = "allow"
-cast_possible_truncation = "allow"
-semicolon_if_nothing_returned = "allow"
+doc_markdown = "deny"
+manual_let_else = "deny"
 
 [profile.dev]
 debug = 0
diff --git a/s/lint b/s/lint
index 22eb9d2b..5782ff3d 100755
--- a/s/lint
+++ b/s/lint
@@ -2,4 +2,4 @@
 set -eu
 
 cargo fmt -- --check
-cargo clippy --all-targets --all-features -- -D clippy::pedantic
+cargo clippy --all-targets --all-features