Skip to content

Commit fe7673b

Browse files
committed
fix(sqlite): correct StmtLocation/StmtLen for non-ASCII characters in comments
ANTLR4-go stores its input stream as []rune, so all token positions returned by GetStart().GetStart() and GetStop().GetStop() are rune indices, not byte offsets. The SQLite parser was storing these values directly as StmtLocation and StmtLen, which are later consumed by source.Pluck() using byte-based Go string slicing (source[head:tail]). For source files that contain multi-byte UTF-8 characters (non-ASCII) in comments, the rune index diverges from the byte offset, causing the plucked query text to be truncated. Each 2-byte character (e.g. Ü, é) caused one byte to be dropped from the end of the query; each 3-byte character (e.g. ♥) caused two bytes to be dropped; and so on. Fix this by building a rune-index to byte-offset map from the source string before processing the ANTLR parse tree, then converting the ANTLR rune positions to byte offsets before storing them in the AST. The internal loc tracking variable continues to use rune indices (for consistency with the ANTLR token positions), while only the values written into StmtLocation and StmtLen are converted to byte offsets. Add TestParseNonASCIIComment covering 2-, 3-, and 4-byte characters in dash comments, multiple non-ASCII characters, and the multi-statement case where an incorrect loc for one statement would propagate and corrupt the StmtLocation of the following statement.
1 parent ce83d3f commit fe7673b

File tree

2 files changed

+125
-5
lines changed

2 files changed

+125
-5
lines changed

internal/engine/sqlite/parse.go

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,29 @@ func NewParser() *Parser {
3737
type Parser struct {
3838
}
3939

40+
// runeToByteOffsets builds a lookup table translating rune indices into byte
// offsets for s: element i holds the byte position at which the i-th rune
// starts. A sentinel entry equal to len(s) is appended, so indexing with the
// total rune count is valid and yields the byte length of s.
//
// The table is required because ANTLR4 keeps its input as []rune, meaning
// token positions (GetStart/GetStop) are rune indices, while Go string
// slicing operates on bytes.
func runeToByteOffsets(s string) []int {
	table := make([]int, 0, len(s)+1)
	for byteOff := range s {
		table = append(table, byteOff)
	}
	return append(table, len(s))
}
55+
4056
func (p *Parser) Parse(r io.Reader) ([]ast.Statement, error) {
4157
blob, err := io.ReadAll(r)
4258
if err != nil {
4359
return nil, err
4460
}
45-
input := antlr.NewInputStream(string(blob))
61+
src := string(blob)
62+
input := antlr.NewInputStream(src)
4663
lexer := parser.NewSQLiteLexer(input)
4764
stream := antlr.NewCommonTokenStream(lexer, 0)
4865
pp := parser.NewSQLiteParser(stream)
@@ -57,13 +74,17 @@ func (p *Parser) Parse(r io.Reader) ([]ast.Statement, error) {
5774
if !ok {
5875
return nil, fmt.Errorf("expected ParserContext; got %T\n", tree)
5976
}
77+
// ANTLR uses rune-based positions. Build a mapping from rune index to byte
78+
// offset so we can store byte-based StmtLocation/StmtLen, which is what
79+
// source.Pluck and the rest of the pipeline expect.
80+
runeByteMap := runeToByteOffsets(src)
6081
var stmts []ast.Statement
6182
for _, istmt := range pctx.AllSql_stmt_list() {
6283
list, ok := istmt.(*parser.Sql_stmt_listContext)
6384
if !ok {
6485
return nil, fmt.Errorf("expected Sql_stmt_listContext; got %T\n", istmt)
6586
}
66-
loc := 0
87+
loc := 0 // rune offset of the current statement's start
6788

6889
for _, stmt := range list.AllSql_stmt() {
6990
converter := &cc{}
@@ -72,12 +93,14 @@ func (p *Parser) Parse(r io.Reader) ([]ast.Statement, error) {
7293
loc = stmt.GetStop().GetStop() + 2
7394
continue
7495
}
75-
len := (stmt.GetStop().GetStop() + 1) - loc
96+
runeEnd := stmt.GetStop().GetStop() + 1
97+
byteStart := runeByteMap[loc]
98+
byteEnd := runeByteMap[runeEnd]
7699
stmts = append(stmts, ast.Statement{
77100
Raw: &ast.RawStmt{
78101
Stmt: out,
79-
StmtLocation: loc,
80-
StmtLen: len,
102+
StmtLocation: byteStart,
103+
StmtLen: byteEnd - byteStart,
81104
},
82105
})
83106
loc = stmt.GetStop().GetStop() + 2
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
package sqlite
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
"github.com/sqlc-dev/sqlc/internal/source"
8+
)
9+
10+
// TestParseNonASCIIComment verifies that non-ASCII characters in SQL comments
11+
// do not corrupt the plucked query text.
12+
//
13+
// ANTLR4 stores the input as []rune so all token positions are rune indices,
14+
// not byte offsets. source.Pluck (and the rest of the pipeline) treats
15+
// StmtLocation/StmtLen as byte offsets. For multi-byte UTF-8 characters the
16+
// two differ, which previously caused the plucked query to be truncated by one
17+
// byte per extra byte in each non-ASCII character.
18+
func TestParseNonASCIIComment(t *testing.T) {
19+
p := NewParser()
20+
21+
tests := []struct {
22+
name string
23+
sql string
24+
}{
25+
{
26+
name: "2-byte char (U+00DC Ü) in dash comment",
27+
sql: "-- name: GetUser :one\n-- Ünïcode comment\nSELECT id FROM users WHERE id = ?",
28+
},
29+
{
30+
name: "3-byte char (U+2665 ♥) in dash comment",
31+
sql: "-- name: GetUser :one\n-- ♥ love\nSELECT id FROM users WHERE id = ?",
32+
},
33+
{
34+
name: "4-byte char (U+1D11E 𝄞) in dash comment",
35+
sql: "-- name: GetUser :one\n-- 𝄞 music\nSELECT id FROM users WHERE id = ?",
36+
},
37+
{
38+
name: "multiple non-ASCII chars in comment",
39+
sql: "-- name: GetUser :one\n-- héllo wörld\nSELECT id FROM users WHERE id = ?",
40+
},
41+
{
42+
name: "non-ASCII only in first of two statements",
43+
sql: "-- name: Q1 :one\n-- Ü\nSELECT 1;\n\n-- name: Q2 :one\nSELECT 2",
44+
},
45+
}
46+
47+
for _, tc := range tests {
48+
t.Run(tc.name, func(t *testing.T) {
49+
stmts, err := p.Parse(strings.NewReader(tc.sql))
50+
if err != nil {
51+
t.Fatalf("Parse error: %v", err)
52+
}
53+
if len(stmts) == 0 {
54+
t.Fatal("expected at least one statement")
55+
}
56+
57+
// For every parsed statement, verify that the plucked text is a
58+
// valid substring of the original SQL (not truncated mid-character).
59+
for i, stmt := range stmts {
60+
raw := stmt.Raw
61+
plucked, err := source.Pluck(tc.sql, raw.StmtLocation, raw.StmtLen)
62+
if err != nil {
63+
t.Fatalf("stmt %d: Pluck error: %v", i, err)
64+
}
65+
if !strings.Contains(tc.sql, plucked) {
66+
t.Errorf("stmt %d: plucked text is not a substring of the input\ngot: %q\ninput: %q", i, plucked, tc.sql)
67+
}
68+
if plucked == "" {
69+
t.Errorf("stmt %d: plucked text is empty", i)
70+
}
71+
}
72+
73+
// For the single-statement cases the plucked text must equal the
74+
// full input, since there is exactly one statement and no trailing
75+
// semicolon to exclude.
76+
if len(stmts) == 1 {
77+
raw := stmts[0].Raw
78+
plucked, _ := source.Pluck(tc.sql, raw.StmtLocation, raw.StmtLen)
79+
if plucked != tc.sql {
80+
t.Errorf("single-statement pluck mismatch\ngot: %q\nwant: %q", plucked, tc.sql)
81+
}
82+
}
83+
84+
// For the two-statement case, verify each statement contains its
85+
// expected SELECT.
86+
if len(stmts) == 2 {
87+
for i, want := range []string{"SELECT 1", "SELECT 2"} {
88+
raw := stmts[i].Raw
89+
plucked, _ := source.Pluck(tc.sql, raw.StmtLocation, raw.StmtLen)
90+
if !strings.Contains(plucked, want) {
91+
t.Errorf("stmt %d: plucked text %q does not contain %q", i, plucked, want)
92+
}
93+
}
94+
}
95+
})
96+
}
97+
}

0 commit comments

Comments
 (0)