From 170f439bfe9411ad50c417d664eb3817ce6bf5e8 Mon Sep 17 00:00:00 2001 From: David Levy Date: Sat, 24 Jan 2026 20:05:46 -0600 Subject: [PATCH 01/13] feat: add -f codepage flag for input/output encoding - Add -f/--code-page flag with ODBC-compatible format parsing - Support 50+ codepages: Unicode, Windows, OEM/DOS, ISO-8859, CJK, EBCDIC, Macintosh - Apply input codepage in IncludeFile() for :r command - Apply output codepage in outCommand() for :OUT file writes - Add --list-codepages flag to display all supported codepages - Add comprehensive unit tests for parsing and encoding lookup --- README.md | 1 + cmd/sqlcmd/sqlcmd.go | 28 ++++ cmd/sqlcmd/sqlcmd_test.go | 21 +++ pkg/sqlcmd/codepage.go | 318 ++++++++++++++++++++++++++++++++++++ pkg/sqlcmd/codepage_test.go | 228 ++++++++++++++++++++++++++ pkg/sqlcmd/commands.go | 27 ++- pkg/sqlcmd/sqlcmd.go | 25 ++- 7 files changed, 644 insertions(+), 4 deletions(-) create mode 100644 pkg/sqlcmd/codepage.go create mode 100644 pkg/sqlcmd/codepage_test.go diff --git a/README.md b/README.md index fe26e192..5397d79e 100644 --- a/README.md +++ b/README.md @@ -133,6 +133,7 @@ The following switches have different behavior in this version of `sqlcmd` compa - To provide the value of the host name in the server certificate when using strict encryption, pass the host name with `-F`. Example: `-Ns -F myhost.domain.com` - More information about client/server encryption negotiation can be found at - `-u` The generated Unicode output file will have the UTF16 Little-Endian Byte-order mark (BOM) written to it. +- `-f` Specifies the code page for input and output files. Format: `codepage | i:codepage[,o:codepage] | o:codepage[,i:codepage]`. Use `65001` for UTF-8. Supported codepages include Unicode (65001, 1200, 1201), Windows (874, 1250-1258), OEM/DOS (437, 850, etc.), ISO-8859 (28591-28606), CJK (932, 936, 949, 950), and EBCDIC (37, 1047, 1140). Use `--list-codepages` to see all supported code pages. - Some behaviors that were kept to maintain compatibility with `OSQL` may be changed, such as alignment of column headers for some data types. - All commands must fit on one line, even `EXIT`. Interactive mode will not check for open parentheses or quotes for commands and prompt for successive lines. The ODBC sqlcmd allows the query run by `EXIT(query)` to span multiple lines. - `-i` doesn't handle a comma `,` in a file name correctly unless the file name argument is triple quoted. For example: diff --git a/cmd/sqlcmd/sqlcmd.go b/cmd/sqlcmd/sqlcmd.go index ea655b47..4fad0232 100644 --- a/cmd/sqlcmd/sqlcmd.go +++ b/cmd/sqlcmd/sqlcmd.go @@ -82,6 +82,8 @@ type SQLCmdArguments struct { ChangePassword string ChangePasswordAndExit string TraceFile string + CodePage string + ListCodePages bool // Keep Help at the end of the list Help bool } @@ -171,6 +173,10 @@ func (a *SQLCmdArguments) Validate(c *cobra.Command) (err error) { err = rangeParameterError("-t", fmt.Sprint(a.QueryTimeout), 0, 65534, true) case a.ServerCertificate != "" && !encryptConnectionAllowsTLS(a.EncryptConnection): err = localizer.Errorf("The -J parameter requires encryption to be enabled (-N true, -N mandatory, or -N strict).") + case a.CodePage != "": + if _, parseErr := sqlcmd.ParseCodePage(a.CodePage); parseErr != nil { + err = localizer.Errorf(`'-f %s': %v`, a.CodePage, parseErr) + } } } if err != nil { @@ -239,6 +245,17 @@ func Execute(version string) { listLocalServers() os.Exit(0) } + // List supported codepages + if args.ListCodePages { + fmt.Println(localizer.Sprintf("Supported Code Pages:")) + fmt.Println() + fmt.Printf("%-8s %-20s %s\n", "Code", "Name", "Description") + fmt.Printf("%-8s %-20s %s\n", "----", "----", "-----------") + for _, cp := range sqlcmd.SupportedCodePages() { + fmt.Printf("%-8d %-20s %s\n", cp.CodePage, cp.Name, cp.Description) + } + os.Exit(0) + } if len(argss) > 0 { fmt.Printf("%s'%s': Unknown command. Enter '--help' for command help.", sqlcmdErrorPrefix, argss[0]) os.Exit(1) @@ -479,6 +496,8 @@ func setFlags(rootCmd *cobra.Command, args *SQLCmdArguments) { rootCmd.Flags().BoolVarP(&args.EnableColumnEncryption, "enable-column-encryption", "g", false, localizer.Sprintf("Enable column encryption")) rootCmd.Flags().StringVarP(&args.ChangePassword, "change-password", "z", "", localizer.Sprintf("New password")) rootCmd.Flags().StringVarP(&args.ChangePasswordAndExit, "change-password-exit", "Z", "", localizer.Sprintf("New password and exit")) + rootCmd.Flags().StringVarP(&args.CodePage, "code-page", "f", "", localizer.Sprintf("Specifies the code page for input/output. Use 65001 for UTF-8. Format: codepage | i:codepage[,o:codepage] | o:codepage[,i:codepage]")) + rootCmd.Flags().BoolVar(&args.ListCodePages, "list-codepages", false, localizer.Sprintf("List supported code pages and exit")) } func setScriptVariable(v string) string { @@ -813,6 +832,15 @@ func run(vars *sqlcmd.Variables, args *SQLCmdArguments) (int, error) { defer s.StopCloseHandler() s.UnicodeOutputFile = args.UnicodeOutputFile + // Parse and apply codepage settings + if args.CodePage != "" { + codePageSettings, err := sqlcmd.ParseCodePage(args.CodePage) + if err != nil { + return 1, localizer.Errorf("Invalid code page: %v", err) + } + s.CodePage = codePageSettings + } + if args.DisableCmd != nil { s.Cmd.DisableSysCommands(args.errorOnBlockedCmd()) } diff --git a/cmd/sqlcmd/sqlcmd_test.go b/cmd/sqlcmd/sqlcmd_test.go index 511816b2..cfdbcf31 100644 --- a/cmd/sqlcmd/sqlcmd_test.go +++ b/cmd/sqlcmd/sqlcmd_test.go @@ -123,6 +123,22 @@ func TestValidCommandLineToArgsConversion(t *testing.T) { {[]string{"-N", "true", "-J", "/path/to/cert2.pem"}, func(args SQLCmdArguments) bool { return args.EncryptConnection == "true" && args.ServerCertificate == "/path/to/cert2.pem" }}, + // Codepage flag tests + {[]string{"-f", "65001"}, func(args SQLCmdArguments) bool { + return args.CodePage == "65001" + }}, + {[]string{"-f", "i:1252,o:65001"}, func(args SQLCmdArguments) bool { + return args.CodePage == "i:1252,o:65001" + }}, + {[]string{"-f", "o:65001,i:1252"}, func(args SQLCmdArguments) bool { + return args.CodePage == "o:65001,i:1252" + }}, + {[]string{"--code-page", "1252"}, func(args SQLCmdArguments) bool { + return args.CodePage == "1252" + }}, + {[]string{"--list-codepages"}, func(args SQLCmdArguments) bool { + return args.ListCodePages + }}, } for _, test := range commands { @@ -178,6 +194,11 @@ func TestInvalidCommandLine(t *testing.T) { {[]string{"-N", "optional", "-J", "/path/to/cert.pem"}, "The -J parameter requires encryption to be enabled (-N true, -N mandatory, or -N strict)."}, {[]string{"-N", "disable", "-J", "/path/to/cert.pem"}, "The -J parameter requires encryption to be enabled (-N true, -N mandatory, or -N strict)."}, {[]string{"-N", "strict", "-F", "myserver.domain.com", "-J", "/path/to/cert.pem"}, "The -F and the -J options are mutually exclusive."}, + // Codepage validation tests + {[]string{"-f", "invalid"}, `'-f invalid': invalid codepage: invalid`}, + {[]string{"-f", "99999"}, `'-f 99999': unsupported codepage 99999`}, + {[]string{"-f", "i:invalid"}, `'-f i:invalid': invalid input codepage: i:invalid`}, + {[]string{"-f", "x:1252"}, `'-f x:1252': invalid codepage: x:1252`}, } for _, test := range commands { diff --git a/pkg/sqlcmd/codepage.go b/pkg/sqlcmd/codepage.go new file mode 100644 index 00000000..231940cb --- /dev/null +++ b/pkg/sqlcmd/codepage.go @@ -0,0 +1,318 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package sqlcmd + +import ( + "fmt" + "strconv" + "strings" + + "golang.org/x/text/encoding" + "golang.org/x/text/encoding/charmap" + "golang.org/x/text/encoding/japanese" + "golang.org/x/text/encoding/korean" + "golang.org/x/text/encoding/simplifiedchinese" + "golang.org/x/text/encoding/traditionalchinese" + "golang.org/x/text/encoding/unicode" +) + +// CodePageSettings holds the input and output codepage settings +type CodePageSettings struct { + InputCodePage int + OutputCodePage int +} + +// ParseCodePage parses the -f codepage argument +// Format: codepage | i:codepage[,o:codepage] | o:codepage[,i:codepage] +func ParseCodePage(arg string) (*CodePageSettings, error) { + if arg == "" { + return nil, nil + } + + settings := &CodePageSettings{} + parts := strings.Split(arg, ",") + + for _, part := range parts { + part = strings.TrimSpace(part) + if part == "" { + continue + } + + if strings.HasPrefix(strings.ToLower(part), "i:") { + // Input codepage + cp, err := strconv.Atoi(strings.TrimPrefix(strings.ToLower(part), "i:")) + if err != nil { + return nil, fmt.Errorf("invalid input codepage: %s", part) + } + settings.InputCodePage = cp + } else if strings.HasPrefix(strings.ToLower(part), "o:") { + // Output codepage + cp, err := strconv.Atoi(strings.TrimPrefix(strings.ToLower(part), "o:")) + if err != nil { + return nil, fmt.Errorf("invalid output codepage: %s", part) + } + settings.OutputCodePage = cp + } else { + // Both input and output + cp, err := strconv.Atoi(part) + if err != nil { + return nil, fmt.Errorf("invalid codepage: %s", part) + } + settings.InputCodePage = cp + settings.OutputCodePage = cp + } + } + + // Validate codepages + if settings.InputCodePage != 0 { + if _, err := GetEncoding(settings.InputCodePage); err != nil { + return nil, err + } + } + if settings.OutputCodePage != 0 { + if _, err := GetEncoding(settings.OutputCodePage); err != nil { + return nil, err + } + } + + return settings, nil +} + +// GetEncoding returns the encoding for a given Windows codepage number. +// Returns nil for UTF-8 (65001) since Go uses UTF-8 natively. +func GetEncoding(codepage int) (encoding.Encoding, error) { + switch codepage { + // Unicode encodings + case 65001: + // UTF-8 - Go's native encoding, return nil to indicate no transformation needed + return nil, nil + case 1200: + // UTF-16LE + return unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM), nil + case 1201: + // UTF-16BE + return unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM), nil + + // OEM/DOS codepages + case 437: + return charmap.CodePage437, nil + case 850: + return charmap.CodePage850, nil + case 852: + return charmap.CodePage852, nil + case 855: + return charmap.CodePage855, nil + case 858: + return charmap.CodePage858, nil + case 860: + return charmap.CodePage860, nil + case 862: + return charmap.CodePage862, nil + case 863: + return charmap.CodePage863, nil + case 865: + return charmap.CodePage865, nil + case 866: + return charmap.CodePage866, nil + + // Windows codepages + case 874: + return charmap.Windows874, nil + case 1250: + return charmap.Windows1250, nil + case 1251: + return charmap.Windows1251, nil + case 1252: + return charmap.Windows1252, nil + case 1253: + return charmap.Windows1253, nil + case 1254: + return charmap.Windows1254, nil + case 1255: + return charmap.Windows1255, nil + case 1256: + return charmap.Windows1256, nil + case 1257: + return charmap.Windows1257, nil + case 1258: + return charmap.Windows1258, nil + + // ISO-8859 codepages + case 28591: + return charmap.ISO8859_1, nil + case 28592: + return charmap.ISO8859_2, nil + case 28593: + return charmap.ISO8859_3, nil + case 28594: + return charmap.ISO8859_4, nil + case 28595: + return charmap.ISO8859_5, nil + case 28596: + return charmap.ISO8859_6, nil + case 28597: + return charmap.ISO8859_7, nil + case 28598: + return charmap.ISO8859_8, nil + case 28599: + return charmap.ISO8859_9, nil + case 28600: + return charmap.ISO8859_10, nil + case 28603: + return charmap.ISO8859_13, nil + case 28604: + return charmap.ISO8859_14, nil + case 28605: + return charmap.ISO8859_15, nil + case 28606: + return charmap.ISO8859_16, nil + + // Cyrillic + case 20866: + return charmap.KOI8R, nil + case 21866: + return charmap.KOI8U, nil + + // Macintosh + case 10000: + return charmap.Macintosh, nil + case 10007: + return charmap.MacintoshCyrillic, nil + + // EBCDIC codepages + case 37: + return charmap.CodePage037, nil + case 1047: + return charmap.CodePage1047, nil + case 1140: + return charmap.CodePage1140, nil + + // Japanese + case 932: + // Shift JIS (Windows-31J) + return japanese.ShiftJIS, nil + case 20932: + // EUC-JP + return japanese.EUCJP, nil + case 50220, 50221, 50222: + // ISO-2022-JP + return japanese.ISO2022JP, nil + + // Korean + case 949: + // EUC-KR (Korean) + return korean.EUCKR, nil + case 51949: + // EUC-KR alternate + return korean.EUCKR, nil + + // Simplified Chinese + case 936: + // GBK (Simplified Chinese) + return simplifiedchinese.GBK, nil + case 54936: + // GB18030 + return simplifiedchinese.GB18030, nil + case 52936: + // HZ-GB2312 + return simplifiedchinese.HZGB2312, nil + + // Traditional Chinese + case 950: + // Big5 + return traditionalchinese.Big5, nil + + default: + return nil, fmt.Errorf("unsupported codepage %d", codepage) + } +} + +// CodePageInfo describes a supported codepage +type CodePageInfo struct { + CodePage int + Name string + Description string +} + +// SupportedCodePages returns a list of all supported codepages with descriptions +func SupportedCodePages() []CodePageInfo { + return []CodePageInfo{ + // Unicode + {65001, "UTF-8", "Unicode (UTF-8)"}, + {1200, "UTF-16LE", "Unicode (UTF-16 Little-Endian)"}, + {1201, "UTF-16BE", "Unicode (UTF-16 Big-Endian)"}, + + // OEM/DOS codepages + {437, "CP437", "OEM United States"}, + {850, "CP850", "OEM Multilingual Latin 1"}, + {852, "CP852", "OEM Latin 2"}, + {855, "CP855", "OEM Cyrillic"}, + {858, "CP858", "OEM Multilingual Latin 1 + Euro"}, + {860, "CP860", "OEM Portuguese"}, + {862, "CP862", "OEM Hebrew"}, + {863, "CP863", "OEM Canadian French"}, + {865, "CP865", "OEM Nordic"}, + {866, "CP866", "OEM Russian"}, + + // Windows codepages + {874, "Windows-874", "Thai"}, + {1250, "Windows-1250", "Central European"}, + {1251, "Windows-1251", "Cyrillic"}, + {1252, "Windows-1252", "Western European"}, + {1253, "Windows-1253", "Greek"}, + {1254, "Windows-1254", "Turkish"}, + {1255, "Windows-1255", "Hebrew"}, + {1256, "Windows-1256", "Arabic"}, + {1257, "Windows-1257", "Baltic"}, + {1258, "Windows-1258", "Vietnamese"}, + + // ISO-8859 codepages + {28591, "ISO-8859-1", "Latin 1 (Western European)"}, + {28592, "ISO-8859-2", "Latin 2 (Central European)"}, + {28593, "ISO-8859-3", "Latin 3 (South European)"}, + {28594, "ISO-8859-4", "Latin 4 (North European)"}, + {28595, "ISO-8859-5", "Cyrillic"}, + {28596, "ISO-8859-6", "Arabic"}, + {28597, "ISO-8859-7", "Greek"}, + {28598, "ISO-8859-8", "Hebrew"}, + {28599, "ISO-8859-9", "Turkish"}, + {28600, "ISO-8859-10", "Nordic"}, + {28603, "ISO-8859-13", "Baltic"}, + {28604, "ISO-8859-14", "Celtic"}, + {28605, "ISO-8859-15", "Latin 9 (Western European with Euro)"}, + {28606, "ISO-8859-16", "Latin 10 (South-Eastern European)"}, + + // Cyrillic + {20866, "KOI8-R", "Russian"}, + {21866, "KOI8-U", "Ukrainian"}, + + // Macintosh + {10000, "Macintosh", "Mac Roman"}, + {10007, "x-mac-cyrillic", "Mac Cyrillic"}, + + // EBCDIC + {37, "IBM037", "EBCDIC US-Canada"}, + {1047, "IBM1047", "EBCDIC Latin 1/Open System"}, + {1140, "IBM01140", "EBCDIC US-Canada with Euro"}, + + // Japanese + {932, "Shift_JIS", "Japanese (Shift-JIS)"}, + {20932, "EUC-JP", "Japanese (EUC)"}, + {50220, "ISO-2022-JP", "Japanese (JIS)"}, + {50221, "csISO2022JP", "Japanese (JIS-Allow 1 byte Kana)"}, + {50222, "ISO-2022-JP", "Japanese (JIS-Allow 1 byte Kana SO/SI)"}, + + // Korean + {949, "EUC-KR", "Korean"}, + {51949, "EUC-KR", "Korean (EUC)"}, + + // Simplified Chinese + {936, "GBK", "Chinese Simplified (GBK)"}, + {54936, "GB18030", "Chinese Simplified (GB18030)"}, + {52936, "HZ-GB-2312", "Chinese Simplified (HZ)"}, + + // Traditional Chinese + {950, "Big5", "Chinese Traditional (Big5)"}, + } +} diff --git a/pkg/sqlcmd/codepage_test.go b/pkg/sqlcmd/codepage_test.go new file mode 100644 index 00000000..63e295f5 --- /dev/null +++ b/pkg/sqlcmd/codepage_test.go @@ -0,0 +1,228 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +package sqlcmd + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestParseCodePage(t *testing.T) { + tests := []struct { + name string + arg string + wantInput int + wantOutput int + wantErr bool + errContains string + }{ + { + name: "empty string", + arg: "", + wantInput: 0, + wantOutput: 0, + wantErr: false, + }, + { + name: "single codepage sets both", + arg: "65001", + wantInput: 65001, + wantOutput: 65001, + wantErr: false, + }, + { + name: "input only", + arg: "i:1252", + wantInput: 1252, + wantOutput: 0, + wantErr: false, + }, + { + name: "output only", + arg: "o:65001", + wantInput: 0, + wantOutput: 65001, + wantErr: false, + }, + { + name: "input and output", + arg: "i:1252,o:65001", + wantInput: 1252, + wantOutput: 65001, + wantErr: false, + }, + { + name: "output and input reversed", + arg: "o:65001,i:1252", + wantInput: 1252, + wantOutput: 65001, + wantErr: false, + }, + { + name: "uppercase prefix", + arg: "I:1252,O:65001", + wantInput: 1252, + wantOutput: 65001, + wantErr: false, + }, + { + name: "invalid codepage number", + arg: "abc", + wantErr: true, + errContains: "invalid codepage", + }, + { + name: "invalid input codepage", + arg: "i:abc", + wantErr: true, + errContains: "invalid input codepage", + }, + { + name: "invalid output codepage", + arg: "o:xyz", + wantErr: true, + errContains: "invalid output codepage", + }, + { + name: "unsupported codepage", + arg: "99999", + wantErr: true, + errContains: "unsupported codepage", + }, + { + name: "Japanese Shift JIS", + arg: "932", + wantInput: 932, + wantOutput: 932, + wantErr: false, + }, + { + name: "Chinese GBK", + arg: "936", + wantInput: 936, + wantOutput: 936, + wantErr: false, + }, + { + name: "Korean", + arg: "949", + wantInput: 949, + wantOutput: 949, + wantErr: false, + }, + { + name: "Traditional Chinese Big5", + arg: "950", + wantInput: 950, + wantOutput: 950, + wantErr: false, + }, + { + name: "EBCDIC", + arg: "37", + wantInput: 37, + wantOutput: 37, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + settings, err := ParseCodePage(tt.arg) + if tt.wantErr { + assert.Error(t, err) + if tt.errContains != "" { + assert.Contains(t, err.Error(), tt.errContains) + } + return + } + assert.NoError(t, err) + if tt.arg == "" { + assert.Nil(t, settings) + return + } + assert.NotNil(t, settings) + assert.Equal(t, tt.wantInput, settings.InputCodePage) + assert.Equal(t, tt.wantOutput, settings.OutputCodePage) + }) + } +} + +func TestGetEncoding(t *testing.T) { + tests := []struct { + codepage int + wantNil bool // UTF-8 returns nil encoding + wantErr bool + }{ + // Unicode + {65001, true, false}, // UTF-8 + {1200, false, false}, // UTF-16LE + {1201, false, false}, // UTF-16BE + + // OEM/DOS + {437, false, false}, + {850, false, false}, + {866, false, false}, + + // Windows + {874, false, false}, + {1250, false, false}, + {1251, false, false}, + {1252, false, false}, + {1253, false, false}, + {1254, false, false}, + {1255, false, false}, + {1256, false, false}, + {1257, false, false}, + {1258, false, false}, + + // ISO-8859 + {28591, false, false}, + {28592, false, false}, + {28605, false, false}, + + // Cyrillic + {20866, false, false}, + {21866, false, false}, + + // Macintosh + {10000, false, false}, + {10007, false, false}, + + // EBCDIC + {37, false, false}, + {1047, false, false}, + {1140, false, false}, + + // CJK + {932, false, false}, // Japanese Shift JIS + {20932, false, false}, // EUC-JP + {50220, false, false}, // ISO-2022-JP + {949, false, false}, // Korean EUC-KR + {936, false, false}, // Chinese GBK + {54936, false, false}, // GB18030 + {950, false, false}, // Big5 + + // Invalid + {99999, false, true}, + {12345, false, true}, + } + + for _, tt := range tests { + t.Run(string(rune(tt.codepage)), func(t *testing.T) { + enc, err := GetEncoding(tt.codepage) + if tt.wantErr { + assert.Error(t, err) + return + } + assert.NoError(t, err) + if tt.wantNil { + assert.Nil(t, enc, "UTF-8 should return nil encoding") + } else { + assert.NotNil(t, enc, "non-UTF-8 codepage should return encoding") + } + }) + } +} diff --git a/pkg/sqlcmd/commands.go b/pkg/sqlcmd/commands.go index 66dd1dba..44af7ed7 100644 --- a/pkg/sqlcmd/commands.go +++ b/pkg/sqlcmd/commands.go @@ -326,6 +326,20 @@ func outCommand(s *Sqlcmd, args []string, line uint) error { win16le := unicode.UTF16(unicode.LittleEndian, unicode.UseBOM) encoder := transform.NewWriter(o, win16le.NewEncoder()) s.SetOutput(encoder) + } else if s.CodePage != nil && s.CodePage.OutputCodePage != 0 { + // Use specified output codepage + enc, err := GetEncoding(s.CodePage.OutputCodePage) + if err != nil { + return err + } + if enc != nil { + // Transform from UTF-8 to specified encoding + encoder := transform.NewWriter(o, enc.NewEncoder()) + s.SetOutput(encoder) + } else { + // UTF-8, no transformation needed + s.SetOutput(o) + } } else { s.SetOutput(o) } @@ -352,7 +366,18 @@ func errorCommand(s *Sqlcmd, args []string, line uint) error { if err != nil { return InvalidFileError(err, args[0]) } - s.SetError(o) + // Apply output codepage if configured + if s.CodePage != nil && s.CodePage.OutputCodePage != 0 { + enc, err := GetEncoding(s.CodePage.OutputCodePage) + if err != nil { + o.Close() + return err + } + encoder := transform.NewWriter(o, enc.NewEncoder()) + s.SetError(encoder) + } else { + s.SetError(o) + } } return nil } diff --git a/pkg/sqlcmd/sqlcmd.go b/pkg/sqlcmd/sqlcmd.go index 5e572a94..da41dd76 100644 --- a/pkg/sqlcmd/sqlcmd.go +++ b/pkg/sqlcmd/sqlcmd.go @@ -86,6 +86,8 @@ type Sqlcmd struct { UnicodeOutputFile bool // EchoInput tells the GO command to print the batch text before running the query EchoInput bool + // CodePage specifies input/output file encoding + CodePage *CodePageSettings colorizer color.Colorizer termchan chan os.Signal } @@ -331,9 +333,26 @@ func (s *Sqlcmd) IncludeFile(path string, processAll bool) error { } defer f.Close() b := s.batch.batchline - utf16bom := unicode.BOMOverride(unicode.UTF8.NewDecoder()) - unicodeReader := transform.NewReader(f, utf16bom) - scanner := bufio.NewReader(unicodeReader) + + // Set up the reader with appropriate encoding + var reader io.Reader = f + if s.CodePage != nil && s.CodePage.InputCodePage != 0 { + // Use specified input codepage + enc, err := GetEncoding(s.CodePage.InputCodePage) + if err != nil { + return err + } + if enc != nil { + // Transform from specified encoding to UTF-8 + reader = transform.NewReader(f, enc.NewDecoder()) + } + // If enc is nil, it's UTF-8, no transformation needed + } else { + // Default: auto-detect BOM for UTF-16, fallback to UTF-8 + utf16bom := unicode.BOMOverride(unicode.UTF8.NewDecoder()) + reader = transform.NewReader(f, utf16bom) + } + scanner := bufio.NewReader(reader) curLine := s.batch.read echoFileLines := s.echoFileLines ln := make([]byte, 0, 2*1024*1024) From 8a74e3baa1b503f76d97cae7f314f8acf6a8359e Mon Sep 17 00:00:00 2001 From: David Levy Date: Sun, 25 Jan 2026 12:59:13 -0600 Subject: [PATCH 02/13] Address Copilot review comments on codepage support - Fix nil encoding panic in errorCommand when OutputCodePage is 65001 (UTF-8) - Close file handle in outCommand when GetEncoding returns an error - Handle close error properly in errorCommand - Apply UTF-8 BOM stripping when input codepage is 65001 - Fix test subtest names to use strconv.Itoa instead of string(rune) --- pkg/sqlcmd/codepage_test.go | 3 ++- pkg/sqlcmd/commands.go | 14 +++++++++++--- pkg/sqlcmd/sqlcmd.go | 5 ++++- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/pkg/sqlcmd/codepage_test.go b/pkg/sqlcmd/codepage_test.go index 63e295f5..01459bf1 100644 --- a/pkg/sqlcmd/codepage_test.go +++ b/pkg/sqlcmd/codepage_test.go @@ -4,6 +4,7 @@ package sqlcmd import ( + "strconv" "testing" "github.com/stretchr/testify/assert" @@ -211,7 +212,7 @@ func TestGetEncoding(t *testing.T) { } for _, tt := range tests { - t.Run(string(rune(tt.codepage)), func(t *testing.T) { + t.Run(strconv.Itoa(tt.codepage), func(t *testing.T) { enc, err := GetEncoding(tt.codepage) if tt.wantErr { assert.Error(t, err) diff --git a/pkg/sqlcmd/commands.go b/pkg/sqlcmd/commands.go index 44af7ed7..ad5a9095 100644 --- a/pkg/sqlcmd/commands.go +++ b/pkg/sqlcmd/commands.go @@ -330,6 +330,7 @@ func outCommand(s *Sqlcmd, args []string, line uint) error { // Use specified output codepage enc, err := GetEncoding(s.CodePage.OutputCodePage) if err != nil { + _ = o.Close() return err } if enc != nil { @@ -370,11 +371,18 @@ func errorCommand(s *Sqlcmd, args []string, line uint) error { if s.CodePage != nil && s.CodePage.OutputCodePage != 0 { enc, err := GetEncoding(s.CodePage.OutputCodePage) if err != nil { - o.Close() + if cerr := o.Close(); cerr != nil { + return fmt.Errorf("%v; additionally, closing error file %q failed: %w", err, args[0], cerr) + } return err } - encoder := transform.NewWriter(o, enc.NewEncoder()) - s.SetError(encoder) + if enc == nil { + // UTF-8 (or default) encoding: write directly without transform + s.SetError(o) + } else { + encoder := transform.NewWriter(o, enc.NewEncoder()) + s.SetError(encoder) + } } else { s.SetError(o) } diff --git a/pkg/sqlcmd/sqlcmd.go b/pkg/sqlcmd/sqlcmd.go index da41dd76..556e5e7d 100644 --- a/pkg/sqlcmd/sqlcmd.go +++ b/pkg/sqlcmd/sqlcmd.go @@ -345,8 +345,11 @@ func (s *Sqlcmd) IncludeFile(path string, processAll bool) error { if enc != nil { // Transform from specified encoding to UTF-8 reader = transform.NewReader(f, enc.NewDecoder()) + } else { + // UTF-8 codepage: still apply BOM stripping + utf8bom := unicode.BOMOverride(unicode.UTF8.NewDecoder()) + reader = transform.NewReader(f, utf8bom) } - // If enc is nil, it's UTF-8, no transformation needed } else { // Default: auto-detect BOM for UTF-16, fallback to UTF-8 utf16bom := unicode.BOMOverride(unicode.UTF8.NewDecoder()) From faa945c4e7e13eb9fef155b88da6bd2bb81769b0 Mon Sep 17 00:00:00 2001 From: David Levy Date: Sun, 25 Jan 2026 13:12:12 -0600 Subject: [PATCH 03/13] Fix ineffectual assignment lint error --- pkg/sqlcmd/sqlcmd.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/sqlcmd/sqlcmd.go b/pkg/sqlcmd/sqlcmd.go index 556e5e7d..7a861c40 100644 --- a/pkg/sqlcmd/sqlcmd.go +++ b/pkg/sqlcmd/sqlcmd.go @@ -335,7 +335,7 @@ func (s *Sqlcmd) IncludeFile(path string, processAll bool) error { b := s.batch.batchline // Set up the reader with appropriate encoding - var reader io.Reader = f + var reader io.Reader if s.CodePage != nil && s.CodePage.InputCodePage != 0 { // Use specified input codepage enc, err := GetEncoding(s.CodePage.InputCodePage) From 424cc32317086b704acff54cc6966181aa16d62b Mon Sep 17 00:00:00 2001 From: David Levy Date: Sun, 25 Jan 2026 13:26:29 -0600 Subject: [PATCH 04/13] Address Copilot review comments for codepage support - Use localizer.Errorf for all user-facing error messages - Fix UTF-16 BOM handling using ExpectBOM for input decoding - Add transformWriteCloser to properly close underlying file handles - Use transformWriteCloser in outCommand and errorCommand for both UnicodeOutputFile and CodePage transforms to prevent file handle leaks - Add integration tests for output/error codepage encoding --- pkg/sqlcmd/codepage.go | 18 +++---- pkg/sqlcmd/commands.go | 36 ++++++++++++-- pkg/sqlcmd/commands_test.go | 97 +++++++++++++++++++++++++++++++++++++ 3 files changed, 138 insertions(+), 13 deletions(-) diff --git a/pkg/sqlcmd/codepage.go b/pkg/sqlcmd/codepage.go index 231940cb..383ae16e 100644 --- a/pkg/sqlcmd/codepage.go +++ b/pkg/sqlcmd/codepage.go @@ -4,10 +4,10 @@ package sqlcmd import ( - "fmt" "strconv" "strings" + "github.com/microsoft/go-sqlcmd/internal/localizer" "golang.org/x/text/encoding" "golang.org/x/text/encoding/charmap" "golang.org/x/text/encoding/japanese" @@ -43,21 +43,21 @@ func ParseCodePage(arg string) (*CodePageSettings, error) { // Input codepage cp, err := strconv.Atoi(strings.TrimPrefix(strings.ToLower(part), "i:")) if err != nil { - return nil, fmt.Errorf("invalid input codepage: %s", part) + return nil, localizer.Errorf("invalid input codepage: %s", part) } settings.InputCodePage = cp } else if strings.HasPrefix(strings.ToLower(part), "o:") { // Output codepage cp, err := strconv.Atoi(strings.TrimPrefix(strings.ToLower(part), "o:")) if err != nil { - return nil, fmt.Errorf("invalid output codepage: %s", part) + return nil, localizer.Errorf("invalid output codepage: %s", part) } settings.OutputCodePage = cp } else { // Both input and output cp, err := strconv.Atoi(part) if err != nil { - return nil, fmt.Errorf("invalid codepage: %s", part) + return nil, localizer.Errorf("invalid codepage: %s", part) } settings.InputCodePage = cp settings.OutputCodePage = cp @@ -88,11 +88,11 @@ func GetEncoding(codepage int) (encoding.Encoding, error) { // UTF-8 - Go's native encoding, return nil to indicate no transformation needed return nil, nil case 1200: - // UTF-16LE - return unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM), nil + // UTF-16LE - Use ExpectBOM to strip BOM if present during input + return unicode.UTF16(unicode.LittleEndian, unicode.ExpectBOM), nil case 1201: - // UTF-16BE - return unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM), nil + // UTF-16BE - Use ExpectBOM to strip BOM if present during input + return unicode.UTF16(unicode.BigEndian, unicode.ExpectBOM), nil // OEM/DOS codepages case 437: @@ -224,7 +224,7 @@ func GetEncoding(codepage int) (encoding.Encoding, error) { return traditionalchinese.Big5, nil default: - return nil, fmt.Errorf("unsupported codepage %d", codepage) + return nil, localizer.Errorf("unsupported codepage %d", codepage) } } diff --git a/pkg/sqlcmd/commands.go b/pkg/sqlcmd/commands.go index ad5a9095..c3938997 100644 --- a/pkg/sqlcmd/commands.go +++ b/pkg/sqlcmd/commands.go @@ -6,6 +6,7 @@ package sqlcmd import ( "flag" "fmt" + "io" "os" "regexp" "sort" @@ -13,10 +14,28 @@ import ( "strings" "github.com/microsoft/go-sqlcmd/internal/color" + "github.com/microsoft/go-sqlcmd/internal/localizer" "golang.org/x/text/encoding/unicode" "golang.org/x/text/transform" ) +// transformWriteCloser wraps a transform.Writer and ensures the underlying +// file is closed when Close() is called. +type transformWriteCloser struct { + *transform.Writer + underlying io.Closer +} + +// Close flushes the transform writer and closes the underlying file. +func (t *transformWriteCloser) Close() error { + // Close the transform writer (flushes pending data) + if err := t.Writer.Close(); err != nil { + _ = t.underlying.Close() + return err + } + return t.underlying.Close() +} + // Command defines a sqlcmd action which can be intermixed with the SQL batch // Commands for sqlcmd are defined at https://docs.microsoft.com/sql/tools/sqlcmd-utility#sqlcmd-commands type Command struct { @@ -324,7 +343,10 @@ func outCommand(s *Sqlcmd, args []string, line uint) error { // ODBC sqlcmd doesn't write a BOM but we will. // Maybe the endian-ness should be configurable. win16le := unicode.UTF16(unicode.LittleEndian, unicode.UseBOM) - encoder := transform.NewWriter(o, win16le.NewEncoder()) + encoder := &transformWriteCloser{ + Writer: transform.NewWriter(o, win16le.NewEncoder()), + underlying: o, + } s.SetOutput(encoder) } else if s.CodePage != nil && s.CodePage.OutputCodePage != 0 { // Use specified output codepage @@ -335,7 +357,10 @@ func outCommand(s *Sqlcmd, args []string, line uint) error { } if enc != nil { // Transform from UTF-8 to specified encoding - encoder := transform.NewWriter(o, enc.NewEncoder()) + encoder := &transformWriteCloser{ + Writer: transform.NewWriter(o, enc.NewEncoder()), + underlying: o, + } s.SetOutput(encoder) } else { // UTF-8, no transformation needed @@ -372,7 +397,7 @@ func errorCommand(s *Sqlcmd, args []string, line uint) error { enc, err := GetEncoding(s.CodePage.OutputCodePage) if err != nil { if cerr := o.Close(); cerr != nil { - return fmt.Errorf("%v; additionally, closing error file %q failed: %w", err, args[0], cerr) + return localizer.Errorf("%v; additionally, closing error file %q failed: %v", err, args[0], cerr) } return err } @@ -380,7 +405,10 @@ func errorCommand(s *Sqlcmd, args []string, line uint) error { // UTF-8 (or default) encoding: write directly without transform s.SetError(o) } else { - encoder := transform.NewWriter(o, enc.NewEncoder()) + encoder := &transformWriteCloser{ + Writer: transform.NewWriter(o, enc.NewEncoder()), + underlying: o, + } s.SetError(encoder) } } else { diff --git a/pkg/sqlcmd/commands_test.go b/pkg/sqlcmd/commands_test.go index 6197aa3f..76612b77 100644 --- a/pkg/sqlcmd/commands_test.go +++ b/pkg/sqlcmd/commands_test.go @@ -458,3 +458,100 @@ func TestExitCommandAppendsParameterToCurrentBatch(t *testing.T) { } } + +func TestOutputCodePageCommand(t *testing.T) { + tests := []struct { + name string + codepage int + expectedBytes []byte + inputText string + skipOnEncError bool + }{ + { + name: "UTF-8 output", + codepage: 65001, + inputText: "café", + expectedBytes: []byte("café"), + }, + { + name: "Windows-1252 output", + codepage: 1252, + inputText: "café", + expectedBytes: []byte{0x63, 0x61, 0x66, 0xe9}, // "café" in Windows-1252 + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s, buf := setupSqlCmdWithMemoryOutput(t) + defer buf.Close() + + // Set up codepage + s.CodePage = &CodePageSettings{ + OutputCodePage: tt.codepage, + } + + // Create temp file for output + file, err := os.CreateTemp("", "sqlcmdout") + require.NoError(t, err, "os.CreateTemp") + defer os.Remove(file.Name()) + fileName := file.Name() + _ = file.Close() + + // Run the OUT command + err = outCommand(s, []string{fileName}, 1) + require.NoError(t, err, "outCommand") + + // Write some text + _, err = s.GetOutput().Write([]byte(tt.inputText)) + require.NoError(t, err, "Write") + + // Close to flush + if closer, ok := s.GetOutput().(interface{ Close() error }); ok { + require.NoError(t, closer.Close(), "Close output") + } + + // Read the file and check encoding + content, err := os.ReadFile(fileName) + require.NoError(t, err, "ReadFile") + assert.Equal(t, tt.expectedBytes, content, "Output encoding mismatch") + }) + } +} + +func TestErrorCodePageCommand(t *testing.T) { + s, buf := setupSqlCmdWithMemoryOutput(t) + defer buf.Close() + + // Set up codepage for Windows-1252 + s.CodePage = &CodePageSettings{ + OutputCodePage: 1252, + } + + // Create temp file for error output + file, err := os.CreateTemp("", "sqlcmderr") + require.NoError(t, err, "os.CreateTemp") + defer os.Remove(file.Name()) + fileName := file.Name() + _ = file.Close() + + // Run the ERROR command + err = errorCommand(s, []string{fileName}, 1) + require.NoError(t, err, "errorCommand") + + // Write some text with special characters + _, err = s.err.Write([]byte("Error: café")) + require.NoError(t, err, "Write") + + // Close to flush + if closer, ok := s.err.(interface{ Close() error }); ok { + require.NoError(t, closer.Close(), "Close error") + } + + // Read the file and check encoding + content, err := os.ReadFile(fileName) + require.NoError(t, err, "ReadFile") + // "Error: café" in Windows-1252 + expected := []byte{0x45, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x63, 0x61, 0x66, 0xe9} + assert.Equal(t, expected, content, "Error output encoding mismatch") +} From 2a14b75846c1405c79a82c54faad0198a14e60da Mon Sep 17 00:00:00 2001 From: David Levy Date: Sun, 25 Jan 2026 13:40:01 -0600 Subject: [PATCH 05/13] Fix locale-specific number formatting in codepage error Use strconv.Itoa instead of %d to avoid locale-specific thousands separators in error message. --- pkg/sqlcmd/codepage.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/sqlcmd/codepage.go b/pkg/sqlcmd/codepage.go index 383ae16e..bcde990b 100644 --- a/pkg/sqlcmd/codepage.go +++ b/pkg/sqlcmd/codepage.go @@ -224,7 +224,7 @@ func GetEncoding(codepage int) (encoding.Encoding, error) { return traditionalchinese.Big5, nil default: - return nil, localizer.Errorf("unsupported codepage %d", codepage) + return nil, localizer.Errorf("unsupported codepage %s", strconv.Itoa(codepage)) } } From 06da0443c9eff2b00190dc4c175ed8a5b1bc084c Mon Sep 17 00:00:00 2001 From: David Levy Date: Sun, 25 Jan 2026 14:49:51 -0600 Subject: [PATCH 06/13] refactor: consolidate codepage definitions into single registry - Create codepageRegistry map as single source of truth for codepages - GetEncoding() now uses the registry instead of switch statement - SupportedCodePages() now generates list from registry - Removes duplicate codepage definitions between the two functions - Sort SupportedCodePages result by codepage number for consistency --- pkg/sqlcmd/codepage.go | 323 +++++++++++++---------------------------- 1 file changed, 104 insertions(+), 219 deletions(-) diff --git a/pkg/sqlcmd/codepage.go b/pkg/sqlcmd/codepage.go index bcde990b..cced2691 100644 --- a/pkg/sqlcmd/codepage.go +++ b/pkg/sqlcmd/codepage.go @@ -4,6 +4,7 @@ package sqlcmd import ( + "sort" "strconv" "strings" @@ -17,6 +18,94 @@ import ( "golang.org/x/text/encoding/unicode" ) +// codepageEntry defines a codepage with its encoding and metadata +type codepageEntry struct { + encoding encoding.Encoding // nil for UTF-8 (Go's native encoding) + name string + description string +} + +// codepageRegistry is the single source of truth for all supported codepages. +// Both GetEncoding and SupportedCodePages use this registry. +var codepageRegistry = map[int]codepageEntry{ + // Unicode + 65001: {nil, "UTF-8", "Unicode (UTF-8)"}, + 1200: {unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM), "UTF-16LE", "Unicode (UTF-16 Little-Endian)"}, + 1201: {unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM), "UTF-16BE", "Unicode (UTF-16 Big-Endian)"}, + + // OEM/DOS codepages + 437: {charmap.CodePage437, "CP437", "OEM United States"}, + 850: {charmap.CodePage850, "CP850", "OEM Multilingual Latin 1"}, + 852: {charmap.CodePage852, "CP852", "OEM Latin 2"}, + 855: {charmap.CodePage855, "CP855", "OEM Cyrillic"}, + 858: {charmap.CodePage858, "CP858", "OEM Multilingual Latin 1 + Euro"}, + 860: {charmap.CodePage860, "CP860", "OEM Portuguese"}, + 862: {charmap.CodePage862, "CP862", "OEM Hebrew"}, + 863: {charmap.CodePage863, "CP863", "OEM Canadian French"}, + 865: {charmap.CodePage865, "CP865", "OEM Nordic"}, + 866: {charmap.CodePage866, "CP866", "OEM Russian"}, + + // Windows codepages + 874: {charmap.Windows874, "Windows-874", "Thai"}, + 1250: {charmap.Windows1250, "Windows-1250", "Central European"}, + 1251: {charmap.Windows1251, "Windows-1251", "Cyrillic"}, + 1252: {charmap.Windows1252, "Windows-1252", "Western European"}, + 1253: {charmap.Windows1253, "Windows-1253", "Greek"}, + 1254: {charmap.Windows1254, "Windows-1254", "Turkish"}, + 1255: {charmap.Windows1255, "Windows-1255", "Hebrew"}, + 1256: {charmap.Windows1256, "Windows-1256", "Arabic"}, + 1257: {charmap.Windows1257, "Windows-1257", "Baltic"}, + 1258: {charmap.Windows1258, "Windows-1258", "Vietnamese"}, + + // ISO-8859 codepages + 28591: {charmap.ISO8859_1, "ISO-8859-1", "Latin 1 (Western European)"}, + 28592: {charmap.ISO8859_2, "ISO-8859-2", "Latin 2 (Central European)"}, + 28593: {charmap.ISO8859_3, "ISO-8859-3", "Latin 3 (South European)"}, + 28594: {charmap.ISO8859_4, "ISO-8859-4", "Latin 4 (North European)"}, + 28595: {charmap.ISO8859_5, "ISO-8859-5", "Cyrillic"}, + 28596: {charmap.ISO8859_6, "ISO-8859-6", "Arabic"}, + 28597: {charmap.ISO8859_7, "ISO-8859-7", "Greek"}, + 28598: {charmap.ISO8859_8, "ISO-8859-8", "Hebrew"}, + 28599: {charmap.ISO8859_9, "ISO-8859-9", "Turkish"}, + 28600: {charmap.ISO8859_10, "ISO-8859-10", "Nordic"}, + 28603: {charmap.ISO8859_13, "ISO-8859-13", "Baltic"}, + 28604: {charmap.ISO8859_14, "ISO-8859-14", "Celtic"}, + 28605: {charmap.ISO8859_15, "ISO-8859-15", "Latin 9 (Western European with Euro)"}, + 28606: {charmap.ISO8859_16, "ISO-8859-16", "Latin 10 (South-Eastern European)"}, + + // Cyrillic + 20866: {charmap.KOI8R, "KOI8-R", "Russian"}, + 21866: {charmap.KOI8U, "KOI8-U", "Ukrainian"}, + + // Macintosh + 10000: {charmap.Macintosh, "Macintosh", "Mac Roman"}, + 10007: {charmap.MacintoshCyrillic, "x-mac-cyrillic", "Mac Cyrillic"}, + + // EBCDIC + 37: {charmap.CodePage037, "IBM037", "EBCDIC US-Canada"}, + 1047: {charmap.CodePage1047, "IBM1047", "EBCDIC Latin 1/Open System"}, + 1140: {charmap.CodePage1140, "IBM01140", "EBCDIC US-Canada with Euro"}, + + // Japanese + 932: {japanese.ShiftJIS, "Shift_JIS", "Japanese (Shift-JIS)"}, + 20932: {japanese.EUCJP, "EUC-JP", "Japanese (EUC)"}, + 50220: {japanese.ISO2022JP, "ISO-2022-JP", "Japanese (JIS)"}, + 50221: {japanese.ISO2022JP, "csISO2022JP", "Japanese (JIS-Allow 1 byte Kana)"}, + 50222: {japanese.ISO2022JP, "ISO-2022-JP", "Japanese (JIS-Allow 1 byte Kana SO/SI)"}, + + // Korean + 949: {korean.EUCKR, "EUC-KR", "Korean"}, + 51949: {korean.EUCKR, "EUC-KR", "Korean (EUC)"}, + + // Simplified Chinese + 936: {simplifiedchinese.GBK, "GBK", "Chinese Simplified (GBK)"}, + 54936: {simplifiedchinese.GB18030, "GB18030", "Chinese Simplified (GB18030)"}, + 52936: {simplifiedchinese.HZGB2312, "HZ-GB-2312", "Chinese Simplified (HZ)"}, + + // Traditional Chinese + 950: {traditionalchinese.Big5, "Big5", "Chinese Traditional (Big5)"}, +} + // CodePageSettings holds the input and output codepage settings type CodePageSettings struct { InputCodePage int @@ -82,150 +171,11 @@ func ParseCodePage(arg string) (*CodePageSettings, error) { // GetEncoding returns the encoding for a given Windows codepage number. // Returns nil for UTF-8 (65001) since Go uses UTF-8 natively. func GetEncoding(codepage int) (encoding.Encoding, error) { - switch codepage { - // Unicode encodings - case 65001: - // UTF-8 - Go's native encoding, return nil to indicate no transformation needed - return nil, nil - case 1200: - // UTF-16LE - Use ExpectBOM to strip BOM if present during input - return unicode.UTF16(unicode.LittleEndian, unicode.ExpectBOM), nil - case 1201: - // UTF-16BE - Use ExpectBOM to strip BOM if present during input - return unicode.UTF16(unicode.BigEndian, unicode.ExpectBOM), nil - - // OEM/DOS codepages - case 437: - return charmap.CodePage437, nil - case 850: - return charmap.CodePage850, nil - case 852: - return charmap.CodePage852, nil - case 855: - return charmap.CodePage855, nil - case 858: - return charmap.CodePage858, nil - case 860: - return charmap.CodePage860, nil - case 862: - return charmap.CodePage862, nil - case 863: - return charmap.CodePage863, nil - case 865: - return charmap.CodePage865, nil - case 866: - return charmap.CodePage866, nil - - // Windows codepages - case 874: - return charmap.Windows874, nil - case 1250: - return charmap.Windows1250, nil - case 1251: - return charmap.Windows1251, nil - case 1252: - return charmap.Windows1252, nil - case 1253: - return charmap.Windows1253, nil - case 1254: - return charmap.Windows1254, nil - case 1255: - return charmap.Windows1255, nil - case 1256: - return charmap.Windows1256, nil - case 1257: - return charmap.Windows1257, nil - case 1258: - return charmap.Windows1258, nil - - // ISO-8859 codepages - case 28591: - return charmap.ISO8859_1, nil - case 28592: - return charmap.ISO8859_2, nil - case 28593: - return charmap.ISO8859_3, nil - case 28594: - return charmap.ISO8859_4, nil - case 28595: - return charmap.ISO8859_5, nil - case 28596: - return charmap.ISO8859_6, nil - case 28597: - return charmap.ISO8859_7, nil - case 28598: - return charmap.ISO8859_8, nil - case 28599: - return charmap.ISO8859_9, nil - case 28600: - return charmap.ISO8859_10, nil - case 28603: - return charmap.ISO8859_13, nil - case 28604: - return charmap.ISO8859_14, nil - case 28605: - return charmap.ISO8859_15, nil - case 28606: - return charmap.ISO8859_16, nil - - // Cyrillic - case 20866: - return charmap.KOI8R, nil - case 21866: - return charmap.KOI8U, nil - - // Macintosh - case 10000: - return charmap.Macintosh, nil - case 10007: - return charmap.MacintoshCyrillic, nil - - // EBCDIC codepages - case 37: - return charmap.CodePage037, nil - case 1047: - return charmap.CodePage1047, nil - case 1140: - return charmap.CodePage1140, nil - - // Japanese - case 932: - // Shift JIS (Windows-31J) - return japanese.ShiftJIS, nil - case 20932: - // EUC-JP - return japanese.EUCJP, nil - case 50220, 50221, 50222: - // ISO-2022-JP - return japanese.ISO2022JP, nil - - // Korean - case 949: - // EUC-KR (Korean) - return korean.EUCKR, nil - case 51949: - // EUC-KR alternate - return korean.EUCKR, nil - - // Simplified Chinese - case 936: - // GBK (Simplified Chinese) - return simplifiedchinese.GBK, nil - case 54936: - // GB18030 - return simplifiedchinese.GB18030, nil - case 52936: - // HZ-GB2312 - return simplifiedchinese.HZGB2312, nil - - // Traditional Chinese - case 950: - // Big5 - return traditionalchinese.Big5, nil - - default: + entry, ok := codepageRegistry[codepage] + if !ok { return nil, localizer.Errorf("unsupported codepage %s", strconv.Itoa(codepage)) } + return entry.encoding, nil } // CodePageInfo describes a supported codepage @@ -237,82 +187,17 @@ type CodePageInfo struct { // SupportedCodePages returns a list of all supported codepages with descriptions func SupportedCodePages() []CodePageInfo { - return []CodePageInfo{ - // Unicode - {65001, "UTF-8", "Unicode (UTF-8)"}, - {1200, "UTF-16LE", "Unicode (UTF-16 Little-Endian)"}, - {1201, "UTF-16BE", "Unicode (UTF-16 Big-Endian)"}, - - // OEM/DOS codepages - {437, "CP437", "OEM United States"}, - {850, "CP850", "OEM Multilingual Latin 1"}, - {852, "CP852", "OEM Latin 2"}, - {855, "CP855", "OEM Cyrillic"}, - {858, "CP858", "OEM Multilingual Latin 1 + Euro"}, - {860, "CP860", "OEM Portuguese"}, - {862, "CP862", "OEM Hebrew"}, - {863, "CP863", "OEM Canadian French"}, - {865, "CP865", "OEM Nordic"}, - {866, "CP866", "OEM Russian"}, - - // Windows codepages - {874, "Windows-874", "Thai"}, - {1250, "Windows-1250", "Central European"}, - {1251, "Windows-1251", "Cyrillic"}, - {1252, "Windows-1252", "Western European"}, - {1253, "Windows-1253", "Greek"}, - {1254, "Windows-1254", "Turkish"}, - {1255, "Windows-1255", "Hebrew"}, - {1256, "Windows-1256", "Arabic"}, - {1257, "Windows-1257", "Baltic"}, - {1258, "Windows-1258", "Vietnamese"}, - - // ISO-8859 codepages - {28591, "ISO-8859-1", "Latin 1 (Western European)"}, - {28592, "ISO-8859-2", "Latin 2 (Central European)"}, - {28593, "ISO-8859-3", "Latin 3 (South European)"}, - {28594, "ISO-8859-4", "Latin 4 (North European)"}, - {28595, "ISO-8859-5", "Cyrillic"}, - {28596, "ISO-8859-6", "Arabic"}, - {28597, "ISO-8859-7", "Greek"}, - {28598, "ISO-8859-8", "Hebrew"}, - {28599, "ISO-8859-9", "Turkish"}, - {28600, "ISO-8859-10", "Nordic"}, - {28603, "ISO-8859-13", "Baltic"}, - {28604, "ISO-8859-14", "Celtic"}, - {28605, "ISO-8859-15", "Latin 9 (Western European with Euro)"}, - {28606, "ISO-8859-16", "Latin 10 (South-Eastern European)"}, - - // Cyrillic - {20866, "KOI8-R", "Russian"}, - {21866, "KOI8-U", "Ukrainian"}, - - // Macintosh - {10000, "Macintosh", "Mac Roman"}, - {10007, "x-mac-cyrillic", "Mac Cyrillic"}, - - // EBCDIC - {37, "IBM037", "EBCDIC US-Canada"}, - {1047, "IBM1047", "EBCDIC Latin 1/Open System"}, - {1140, "IBM01140", "EBCDIC US-Canada with Euro"}, - - // Japanese - {932, "Shift_JIS", "Japanese (Shift-JIS)"}, - {20932, "EUC-JP", "Japanese (EUC)"}, - {50220, "ISO-2022-JP", "Japanese (JIS)"}, - {50221, "csISO2022JP", "Japanese (JIS-Allow 1 byte Kana)"}, - {50222, "ISO-2022-JP", "Japanese (JIS-Allow 1 byte Kana SO/SI)"}, - - // Korean - {949, "EUC-KR", "Korean"}, - {51949, "EUC-KR", "Korean (EUC)"}, - - // Simplified Chinese - {936, "GBK", "Chinese Simplified (GBK)"}, - {54936, "GB18030", "Chinese Simplified (GB18030)"}, - {52936, "HZ-GB-2312", "Chinese Simplified (HZ)"}, - - // Traditional Chinese - {950, "Big5", "Chinese Traditional (Big5)"}, + result := make([]CodePageInfo, 0, len(codepageRegistry)) + for cp, entry := range codepageRegistry { + result = append(result, CodePageInfo{ + CodePage: cp, + Name: entry.name, + Description: entry.description, + }) } + // Sort by codepage number for consistent output + sort.Slice(result, func(i, j int) bool { + return result[i].CodePage < result[j].CodePage + }) + return result } From 065b1f45f3750955ad7088cf88cb4fa496c4e851 Mon Sep 17 00:00:00 2001 From: David Levy Date: Sun, 25 Jan 2026 14:50:27 -0600 Subject: [PATCH 07/13] test: add TestSupportedCodePages to verify registry consistency - Verify all returned codepages are valid in GetEncoding - Ensure results are sorted by codepage number - Check well-known codepages are present --- pkg/sqlcmd/codepage_test.go | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/pkg/sqlcmd/codepage_test.go b/pkg/sqlcmd/codepage_test.go index 01459bf1..47f7fae3 100644 --- a/pkg/sqlcmd/codepage_test.go +++ b/pkg/sqlcmd/codepage_test.go @@ -227,3 +227,39 @@ func TestGetEncoding(t *testing.T) { }) } } + +func TestSupportedCodePages(t *testing.T) { + cps := SupportedCodePages() + + // Should have entries + assert.Greater(t, len(cps), 0, "should return codepages") + + // Each returned codepage should be valid in GetEncoding + for _, cp := range cps { + _, err := GetEncoding(cp.CodePage) + assert.NoError(t, err, "SupportedCodePages entry %d should be valid in GetEncoding", cp.CodePage) + assert.NotEmpty(t, cp.Name, "codepage %d should have a name", cp.CodePage) + assert.NotEmpty(t, cp.Description, "codepage %d should have a description", cp.CodePage) + } + + // Result should be sorted by codepage number + for i := 1; i < len(cps); i++ { + assert.Less(t, cps[i-1].CodePage, cps[i].CodePage, "codepages should be sorted") + } + + // Check some well-known codepages are present + known := map[int]bool{ + 65001: false, // UTF-8 + 1252: false, // Windows Western + 437: false, // DOS US + 932: false, // Japanese + } + for _, cp := range cps { + if _, ok := known[cp.CodePage]; ok { + known[cp.CodePage] = true + } + } + for cp, found := range known { + assert.True(t, found, "well-known codepage %d should be in list", cp) + } +} From 9b0cd9144bdb9a5fad2c2f55746ca4c715855642 Mon Sep 17 00:00:00 2001 From: David Levy Date: Sun, 25 Jan 2026 16:53:38 -0600 Subject: [PATCH 08/13] Fix UTF-16 BOM handling and add Windows codepage fallback - Fix IncludeFile to strip BOM from UTF-16 encoded files using BOMOverride - Add Windows API fallback (MultiByteToWideChar/WideCharToMultiByte) for codepages not in built-in registry (e.g., Japanese EBCDIC 20290) - Add helpful error on non-Windows when codepage not in registry - Add TestIncludeFileWithInputCodePage for Windows-1252 and UTF-16 LE/BE - Add TestGetEncodingWindowsFallback to verify Windows API fallback - Update README.md to document Windows codepage availability - Update code comments to document cross-platform vs Windows-only support --- README.md | 2 +- pkg/sqlcmd/codepage.go | 11 +- pkg/sqlcmd/codepage_other.go | 19 ++++ pkg/sqlcmd/codepage_test.go | 37 ++++++- pkg/sqlcmd/codepage_windows.go | 183 +++++++++++++++++++++++++++++++++ pkg/sqlcmd/sqlcmd.go | 8 +- pkg/sqlcmd/sqlcmd_test.go | 64 ++++++++++++ 7 files changed, 318 insertions(+), 6 deletions(-) create mode 100644 pkg/sqlcmd/codepage_other.go create mode 100644 pkg/sqlcmd/codepage_windows.go diff --git a/README.md b/README.md index 5397d79e..0c174a2a 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ The following switches have different behavior in this version of `sqlcmd` compa - To provide the value of the host name in the server certificate when using strict encryption, pass the host name with `-F`. Example: `-Ns -F myhost.domain.com` - More information about client/server encryption negotiation can be found at - `-u` The generated Unicode output file will have the UTF16 Little-Endian Byte-order mark (BOM) written to it. -- `-f` Specifies the code page for input and output files. Format: `codepage | i:codepage[,o:codepage] | o:codepage[,i:codepage]`. Use `65001` for UTF-8. Supported codepages include Unicode (65001, 1200, 1201), Windows (874, 1250-1258), OEM/DOS (437, 850, etc.), ISO-8859 (28591-28606), CJK (932, 936, 949, 950), and EBCDIC (37, 1047, 1140). Use `--list-codepages` to see all supported code pages. +- `-f` Specifies the code page for input and output files. Format: `codepage | i:codepage[,o:codepage] | o:codepage[,i:codepage]`. Use `65001` for UTF-8. Supported codepages include Unicode (65001, 1200, 1201), Windows (874, 1250-1258), OEM/DOS (437, 850, etc.), ISO-8859 (28591-28606), CJK (932, 936, 949, 950), and EBCDIC (37, 1047, 1140). On Windows, additional codepages installed on the system (such as Japanese EBCDIC) are also available. Use `--list-codepages` to see all supported code pages. - Some behaviors that were kept to maintain compatibility with `OSQL` may be changed, such as alignment of column headers for some data types. - All commands must fit on one line, even `EXIT`. Interactive mode will not check for open parentheses or quotes for commands and prompt for successive lines. The ODBC sqlcmd allows the query run by `EXIT(query)` to span multiple lines. - `-i` doesn't handle a comma `,` in a file name correctly unless the file name argument is triple quoted. For example: diff --git a/pkg/sqlcmd/codepage.go b/pkg/sqlcmd/codepage.go index cced2691..1466bbf4 100644 --- a/pkg/sqlcmd/codepage.go +++ b/pkg/sqlcmd/codepage.go @@ -25,8 +25,10 @@ type codepageEntry struct { description string } -// codepageRegistry is the single source of truth for all supported codepages. -// Both GetEncoding and SupportedCodePages use this registry. +// codepageRegistry is the single source of truth for all supported codepages +// that work cross-platform. Both GetEncoding and SupportedCodePages use this +// registry. On Windows, additional codepages installed on the system are also +// available via the Windows API fallback in GetEncoding. var codepageRegistry = map[int]codepageEntry{ // Unicode 65001: {nil, "UTF-8", "Unicode (UTF-8)"}, @@ -170,10 +172,13 @@ func ParseCodePage(arg string) (*CodePageSettings, error) { // GetEncoding returns the encoding for a given Windows codepage number. // Returns nil for UTF-8 (65001) since Go uses UTF-8 natively. +// If the codepage is not in the built-in registry, falls back to +// OS-specific support (Windows API on Windows, error on other platforms). func GetEncoding(codepage int) (encoding.Encoding, error) { entry, ok := codepageRegistry[codepage] if !ok { - return nil, localizer.Errorf("unsupported codepage %s", strconv.Itoa(codepage)) + // Fallback to system-provided codepage support + return getSystemCodePageEncoding(codepage) } return entry.encoding, nil } diff --git a/pkg/sqlcmd/codepage_other.go b/pkg/sqlcmd/codepage_other.go new file mode 100644 index 00000000..c27f78dc --- /dev/null +++ b/pkg/sqlcmd/codepage_other.go @@ -0,0 +1,19 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build !windows + +package sqlcmd + +import ( + "github.com/microsoft/go-sqlcmd/internal/localizer" + "golang.org/x/text/encoding" +) + +// getSystemCodePageEncoding returns an error on non-Windows platforms +// since we don't have access to Windows API for codepage conversion. +// The built-in codepageRegistry covers the most common codepages. +// For additional codepages (e.g., Japanese EBCDIC), use Windows. +func getSystemCodePageEncoding(codepage int) (encoding.Encoding, error) { + return nil, localizer.Errorf("codepage %d is not supported on this platform; additional codepages are available on Windows", codepage) +} diff --git a/pkg/sqlcmd/codepage_test.go b/pkg/sqlcmd/codepage_test.go index 47f7fae3..748c9983 100644 --- a/pkg/sqlcmd/codepage_test.go +++ b/pkg/sqlcmd/codepage_test.go @@ -90,7 +90,7 @@ func TestParseCodePage(t *testing.T) { name: "unsupported codepage", arg: "99999", wantErr: true, - errContains: "unsupported codepage", + errContains: "codepage", // Error message varies by platform }, { name: "Japanese Shift JIS", @@ -263,3 +263,38 @@ func TestSupportedCodePages(t *testing.T) { assert.True(t, found, "well-known codepage %d should be in list", cp) } } + +func TestGetEncodingWindowsFallback(t *testing.T) { + // Japanese EBCDIC (20290) is not in our built-in registry but is available on Windows + // This test verifies that the Windows API fallback works for codepages not in our registry + cp := 20290 // IBM EBCDIC Japanese Katakana Extended + + enc, err := GetEncoding(cp) + + // On Windows, this should succeed because the Windows API can handle this codepage + // On other platforms, this should fail with a helpful error message + if err != nil { + // Expected on non-Windows platforms + assert.Contains(t, err.Error(), "codepage") + } else { + // Expected on Windows - verify the encoding works + assert.NotNil(t, enc) + + // Test round-trip encoding/decoding + // EBCDIC 'A' is 0xC1 + decoder := enc.NewDecoder() + decoded, err := decoder.String(string([]byte{0xC1})) + assert.NoError(t, err, "decoder should work") + assert.Equal(t, "A", decoded, "EBCDIC 0xC1 should decode to 'A'") + + encoder := enc.NewEncoder() + encoded, err := encoder.String("A") + assert.NoError(t, err, "encoder should work") + assert.Equal(t, []byte{0xC1}, []byte(encoded), "'A' should encode to EBCDIC 0xC1") + } + + // Also test that a completely made-up codepage fails on all platforms + _, err = GetEncoding(99999) + assert.Error(t, err, "invalid codepage should fail on all platforms") + assert.Contains(t, err.Error(), "codepage") +} diff --git a/pkg/sqlcmd/codepage_windows.go b/pkg/sqlcmd/codepage_windows.go new file mode 100644 index 00000000..e2394501 --- /dev/null +++ b/pkg/sqlcmd/codepage_windows.go @@ -0,0 +1,183 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +//go:build windows + +package sqlcmd + +import ( + "errors" + "unicode/utf16" + "unsafe" + + "github.com/microsoft/go-sqlcmd/internal/localizer" + "golang.org/x/sys/windows" + "golang.org/x/text/encoding" + "golang.org/x/text/transform" +) + +var ( + kernel32 = windows.NewLazySystemDLL("kernel32.dll") + procMultiByteToWideChar = kernel32.NewProc("MultiByteToWideChar") + procWideCharToMultiByte = kernel32.NewProc("WideCharToMultiByte") +) + +// windowsCodePageEncoding implements encoding.Encoding using Windows API +type windowsCodePageEncoding struct { + codepage uint32 +} + +func (e *windowsCodePageEncoding) NewDecoder() *encoding.Decoder { + return &encoding.Decoder{Transformer: &windowsDecoder{codepage: e.codepage}} +} + +func (e *windowsCodePageEncoding) NewEncoder() *encoding.Encoder { + return &encoding.Encoder{Transformer: &windowsEncoder{codepage: e.codepage}} +} + +// windowsDecoder converts from a Windows codepage to UTF-8 +type windowsDecoder struct { + codepage uint32 +} + +func (d *windowsDecoder) Reset() {} + +func (d *windowsDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + if len(src) == 0 { + return 0, 0, nil + } + + // First call to get required buffer size for wide chars + n, _, errno := procMultiByteToWideChar.Call( + uintptr(d.codepage), + 0, + uintptr(unsafe.Pointer(&src[0])), + uintptr(len(src)), + 0, + 0, + ) + if n == 0 { + if errno != windows.ERROR_SUCCESS { + return 0, 0, errno + } + return 0, 0, errors.New("MultiByteToWideChar failed") + } + + // Allocate wide char buffer + wideChars := make([]uint16, n) + + // Convert to wide chars + n, _, errno = procMultiByteToWideChar.Call( + uintptr(d.codepage), + 0, + uintptr(unsafe.Pointer(&src[0])), + uintptr(len(src)), + uintptr(unsafe.Pointer(&wideChars[0])), + uintptr(len(wideChars)), + ) + if n == 0 { + if errno != windows.ERROR_SUCCESS { + return 0, 0, errno + } + return 0, 0, errors.New("MultiByteToWideChar failed") + } + + // Convert UTF-16 to UTF-8 + runes := utf16.Decode(wideChars[:n]) + utf8Bytes := []byte(string(runes)) + + if len(utf8Bytes) > len(dst) { + return 0, 0, transform.ErrShortDst + } + + copy(dst, utf8Bytes) + return len(utf8Bytes), len(src), nil +} + +// windowsEncoder converts from UTF-8 to a Windows codepage +type windowsEncoder struct { + codepage uint32 +} + +func (e *windowsEncoder) Reset() {} + +func (e *windowsEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + if len(src) == 0 { + return 0, 0, nil + } + + // Convert UTF-8 to UTF-16 + runes := []rune(string(src)) + wideChars := utf16.Encode(runes) + + if len(wideChars) == 0 { + return 0, len(src), nil + } + + // First call to get required buffer size + n, _, errno := procWideCharToMultiByte.Call( + uintptr(e.codepage), + 0, + uintptr(unsafe.Pointer(&wideChars[0])), + uintptr(len(wideChars)), + 0, + 0, + 0, + 0, + ) + if n == 0 { + if errno != windows.ERROR_SUCCESS { + return 0, 0, errno + } + return 0, 0, errors.New("WideCharToMultiByte failed") + } + + if int(n) > len(dst) { + return 0, 0, transform.ErrShortDst + } + + // Convert to multibyte + n, _, errno = procWideCharToMultiByte.Call( + uintptr(e.codepage), + 0, + uintptr(unsafe.Pointer(&wideChars[0])), + uintptr(len(wideChars)), + uintptr(unsafe.Pointer(&dst[0])), + uintptr(len(dst)), + 0, + 0, + ) + if n == 0 { + if errno != windows.ERROR_SUCCESS { + return 0, 0, errno + } + return 0, 0, errors.New("WideCharToMultiByte failed") + } + + return int(n), len(src), nil +} + +// isCodePageValid checks if a codepage is valid/installed on Windows +func isCodePageValid(codepage uint32) bool { + // Try to convert a simple byte - if the codepage is invalid, this will fail + src := []byte{0x41} // 'A' + n, _, _ := procMultiByteToWideChar.Call( + uintptr(codepage), + 0, + uintptr(unsafe.Pointer(&src[0])), + 1, + 0, + 0, + ) + return n > 0 +} + +// getSystemCodePageEncoding returns an encoding using Windows API for codepages +// not in our built-in registry. Returns nil if the codepage is not available. +func getSystemCodePageEncoding(codepage int) (encoding.Encoding, error) { + cp := uint32(codepage) + if !isCodePageValid(cp) { + return nil, localizer.Errorf("codepage %d is not installed on this system", codepage) + } + return &windowsCodePageEncoding{codepage: cp}, nil +} diff --git a/pkg/sqlcmd/sqlcmd.go b/pkg/sqlcmd/sqlcmd.go index 7a861c40..221676e6 100644 --- a/pkg/sqlcmd/sqlcmd.go +++ b/pkg/sqlcmd/sqlcmd.go @@ -344,7 +344,13 @@ func (s *Sqlcmd) IncludeFile(path string, processAll bool) error { } if enc != nil { // Transform from specified encoding to UTF-8 - reader = transform.NewReader(f, enc.NewDecoder()) + // For UTF-16 codepages, wrap with BOMOverride to strip BOM if present + if s.CodePage.InputCodePage == 1200 || s.CodePage.InputCodePage == 1201 { + // UTF-16 LE/BE: use BOMOverride to handle BOM gracefully + reader = transform.NewReader(f, unicode.BOMOverride(enc.NewDecoder())) + } else { + reader = transform.NewReader(f, enc.NewDecoder()) + } } else { // UTF-8 codepage: still apply BOM stripping utf8bom := unicode.BOMOverride(unicode.UTF8.NewDecoder()) diff --git a/pkg/sqlcmd/sqlcmd_test.go b/pkg/sqlcmd/sqlcmd_test.go index dfe97d1a..a28958ec 100644 --- a/pkg/sqlcmd/sqlcmd_test.go +++ b/pkg/sqlcmd/sqlcmd_test.go @@ -19,6 +19,7 @@ import ( "github.com/google/uuid" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) const oneRowAffected = "(1 row affected)" @@ -232,6 +233,69 @@ func TestIncludeFileQuotedIdentifiers(t *testing.T) { } } +func TestIncludeFileWithInputCodePage(t *testing.T) { + tests := []struct { + name string + codepage int + fileContent []byte + expectedText string + }{ + { + name: "Windows-1252 input", + codepage: 1252, + fileContent: []byte{0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x20, 0x27, 0x63, 0x61, 0x66, 0xe9, 0x27}, // "select 'café'" in Windows-1252 + expectedText: "select 'café'", + }, + { + name: "UTF-16 LE with BOM", + codepage: 1200, + fileContent: []byte{0xFF, 0xFE, 0x68, 0x00, 0x69, 0x00}, // BOM + "hi" in UTF-16 LE + expectedText: "hi", + }, + { + name: "UTF-16 LE without BOM", + codepage: 1200, + fileContent: []byte{0x68, 0x00, 0x69, 0x00}, // "hi" in UTF-16 LE (no BOM) + expectedText: "hi", + }, + { + name: "UTF-16 BE with BOM", + codepage: 1201, + fileContent: []byte{0xFE, 0xFF, 0x00, 0x68, 0x00, 0x69}, // BOM + "hi" in UTF-16 BE + expectedText: "hi", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create temp file with encoded content + file, err := os.CreateTemp("", "sqlcmdinput*.sql") + require.NoError(t, err, "os.CreateTemp") + defer os.Remove(file.Name()) + + _, err = file.Write(tt.fileContent) + require.NoError(t, err, "Write") + err = file.Close() + require.NoError(t, err, "Close") + + // Set up Sqlcmd with InputCodePage + s, buf := setupSqlCmdWithMemoryOutput(t) + defer buf.Close() + s.CodePage = &CodePageSettings{ + InputCodePage: tt.codepage, + } + + // Include the file but don't execute (processAll=false) + err = s.IncludeFile(file.Name(), false) + require.NoError(t, err, "IncludeFile") + + // Check that the batch contains the expected decoded text + batchText := s.batch.String() + assert.Contains(t, batchText, tt.expectedText, "batch should contain decoded text") + }) + } +} + func TestGetRunnableQuery(t *testing.T) { v := InitializeVariables(false) v.Set("var1", "v1") From ff0561907f171079529eff8a3bfcef59a8c78b67 Mon Sep 17 00:00:00 2001 From: David Levy Date: Sun, 25 Jan 2026 17:24:44 -0600 Subject: [PATCH 09/13] Address Copilot review comments for codepage implementation - Change UTF-16 BOM handling from IgnoreBOM to UseBOM for codepages 1200 (UTF-16 LE) and 1201 (UTF-16 BE) to properly strip BOMs on decode (pkg/sqlcmd/codepage.go) - Eliminate redundant CodePage parsing by storing parsed settings in codePageSettings field after validation in Validate(), then reusing in run() (cmd/sqlcmd/sqlcmd.go) - Add comprehensive Code Page Support documentation section to README.md with format guide, common codepages table, practical examples, and notes on default behavior Note: Integration test for IncludeFile with non-UTF8 input already exists in TestIncludeFileWithInputCodePage (pkg/sqlcmd/sqlcmd_test.go) --- README.md | 64 +++++++++++++++++++++++++++++++++++++++++- cmd/sqlcmd/sqlcmd.go | 19 +++++++------ pkg/sqlcmd/codepage.go | 4 +-- 3 files changed, 75 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 0c174a2a..ec53271c 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ The following switches have different behavior in this version of `sqlcmd` compa - To provide the value of the host name in the server certificate when using strict encryption, pass the host name with `-F`. Example: `-Ns -F myhost.domain.com` - More information about client/server encryption negotiation can be found at - `-u` The generated Unicode output file will have the UTF16 Little-Endian Byte-order mark (BOM) written to it. -- `-f` Specifies the code page for input and output files. Format: `codepage | i:codepage[,o:codepage] | o:codepage[,i:codepage]`. Use `65001` for UTF-8. Supported codepages include Unicode (65001, 1200, 1201), Windows (874, 1250-1258), OEM/DOS (437, 850, etc.), ISO-8859 (28591-28606), CJK (932, 936, 949, 950), and EBCDIC (37, 1047, 1140). On Windows, additional codepages installed on the system (such as Japanese EBCDIC) are also available. Use `--list-codepages` to see all supported code pages. +- `-f` Specifies the code page for input and output files. See [Code Page Support](#code-page-support) below for details and examples. - Some behaviors that were kept to maintain compatibility with `OSQL` may be changed, such as alignment of column headers for some data types. - All commands must fit on one line, even `EXIT`. Interactive mode will not check for open parentheses or quotes for commands and prompt for successive lines. The ODBC sqlcmd allows the query run by `EXIT(query)` to span multiple lines. - `-i` doesn't handle a comma `,` in a file name correctly unless the file name argument is triple quoted. For example: @@ -238,6 +238,68 @@ To see a list of available styles along with colored syntax samples, use this co :list color ``` +### Code Page Support + +The `-f` flag specifies the code page for reading input files and writing output. This is useful when working with SQL scripts saved in legacy encodings or when output needs to be in a specific encoding. + +#### Format + +``` +-f codepage # Set both input and output to the same codepage +-f i:codepage # Set input codepage only +-f o:codepage # Set output codepage only +-f i:codepage,o:codepage # Set input and output to different codepages +-f o:codepage,i:codepage # Same as above (order doesn't matter) +``` + +#### Common Code Pages + +| Code Page | Name | Description | +|-----------|------|-------------| +| 65001 | UTF-8 | Unicode (UTF-8) - default for most modern systems | +| 1200 | UTF-16LE | Unicode (UTF-16 Little-Endian) | +| 1201 | UTF-16BE | Unicode (UTF-16 Big-Endian) | +| 1252 | Windows-1252 | Western European (Windows) | +| 932 | Shift_JIS | Japanese | +| 936 | GBK | Chinese Simplified | +| 949 | EUC-KR | Korean | +| 950 | Big5 | Chinese Traditional | +| 437 | CP437 | OEM United States (DOS) | + +#### Examples + +**Run a script saved in Windows-1252 encoding:** +```bash +sqlcmd -S myserver -i legacy_script.sql -f 1252 +``` + +**Read UTF-16 input file and write UTF-8 output:** +```bash +sqlcmd -S myserver -i unicode_script.sql -o results.txt -f i:1200,o:65001 +``` + +**Process a Japanese Shift-JIS encoded script:** +```bash +sqlcmd -S myserver -i japanese_data.sql -f 932 +``` + +**Write output in Windows-1252 for legacy applications:** +```bash +sqlcmd -S myserver -Q "SELECT * FROM Products" -o report.txt -f o:1252 +``` + +**List all supported code pages:** +```bash +sqlcmd --list-codepages +``` + +#### Notes + +- When no `-f` flag is specified, sqlcmd auto-detects UTF-16 BOM (Byte Order Mark) in input files and falls back to UTF-8. +- UTF-8 input files with BOM are handled automatically. +- On Windows, additional codepages installed on the system are also available via the Windows API. +- Use `--list-codepages` to see all supported code pages with their names and descriptions. + ### Packages #### sqlcmd executable diff --git a/cmd/sqlcmd/sqlcmd.go b/cmd/sqlcmd/sqlcmd.go index 4fad0232..66b60665 100644 --- a/cmd/sqlcmd/sqlcmd.go +++ b/cmd/sqlcmd/sqlcmd.go @@ -83,7 +83,10 @@ type SQLCmdArguments struct { ChangePasswordAndExit string TraceFile string CodePage string - ListCodePages bool + // codePageSettings stores the parsed CodePageSettings after validation. + // This avoids parsing CodePage twice (in Validate and run). + codePageSettings *sqlcmd.CodePageSettings + ListCodePages bool // Keep Help at the end of the list Help bool } @@ -174,8 +177,10 @@ func (a *SQLCmdArguments) Validate(c *cobra.Command) (err error) { case a.ServerCertificate != "" && !encryptConnectionAllowsTLS(a.EncryptConnection): err = localizer.Errorf("The -J parameter requires encryption to be enabled (-N true, -N mandatory, or -N strict).") case a.CodePage != "": - if _, parseErr := sqlcmd.ParseCodePage(a.CodePage); parseErr != nil { + if codePageSettings, parseErr := sqlcmd.ParseCodePage(a.CodePage); parseErr != nil { err = localizer.Errorf(`'-f %s': %v`, a.CodePage, parseErr) + } else { + a.codePageSettings = codePageSettings } } } @@ -832,13 +837,9 @@ func run(vars *sqlcmd.Variables, args *SQLCmdArguments) (int, error) { defer s.StopCloseHandler() s.UnicodeOutputFile = args.UnicodeOutputFile - // Parse and apply codepage settings - if args.CodePage != "" { - codePageSettings, err := sqlcmd.ParseCodePage(args.CodePage) - if err != nil { - return 1, localizer.Errorf("Invalid code page: %v", err) - } - s.CodePage = codePageSettings + // Apply codepage settings (already parsed and validated in Validate) + if args.codePageSettings != nil { + s.CodePage = args.codePageSettings } if args.DisableCmd != nil { diff --git a/pkg/sqlcmd/codepage.go b/pkg/sqlcmd/codepage.go index 1466bbf4..30675492 100644 --- a/pkg/sqlcmd/codepage.go +++ b/pkg/sqlcmd/codepage.go @@ -32,8 +32,8 @@ type codepageEntry struct { var codepageRegistry = map[int]codepageEntry{ // Unicode 65001: {nil, "UTF-8", "Unicode (UTF-8)"}, - 1200: {unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM), "UTF-16LE", "Unicode (UTF-16 Little-Endian)"}, - 1201: {unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM), "UTF-16BE", "Unicode (UTF-16 Big-Endian)"}, + 1200: {unicode.UTF16(unicode.LittleEndian, unicode.UseBOM), "UTF-16LE", "Unicode (UTF-16 Little-Endian)"}, + 1201: {unicode.UTF16(unicode.BigEndian, unicode.UseBOM), "UTF-16BE", "Unicode (UTF-16 Big-Endian)"}, // OEM/DOS codepages 437: {charmap.CodePage437, "CP437", "OEM United States"}, From 2e62e1047d1baae0b5dc8ad61288e34c308f54b9 Mon Sep 17 00:00:00 2001 From: David Levy Date: Sun, 25 Jan 2026 17:47:18 -0600 Subject: [PATCH 10/13] Address Copilot review comments (round 7) - Add validation to error when -f arg is non-empty but no codepage parsed (e.g., ',' or whitespace-only input) - Add unit tests for comma-only, whitespace-only, and multiple-comma inputs - Fix misleading BOM comments to accurately describe BOMOverride behavior - Remove unused skipOnEncError field from test table --- pkg/sqlcmd/codepage.go | 6 ++++++ pkg/sqlcmd/codepage_test.go | 18 ++++++++++++++++++ pkg/sqlcmd/commands_test.go | 9 ++++----- pkg/sqlcmd/sqlcmd.go | 4 ++-- 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/pkg/sqlcmd/codepage.go b/pkg/sqlcmd/codepage.go index 30675492..fc44e5b5 100644 --- a/pkg/sqlcmd/codepage.go +++ b/pkg/sqlcmd/codepage.go @@ -155,6 +155,12 @@ func ParseCodePage(arg string) (*CodePageSettings, error) { } } + // If a non-empty argument was provided but no codepage was parsed, + // treat this as an error rather than silently disabling codepage handling. + if settings.InputCodePage == 0 && settings.OutputCodePage == 0 { + return nil, localizer.Errorf("invalid codepage: %s", arg) + } + // Validate codepages if settings.InputCodePage != 0 { if _, err := GetEncoding(settings.InputCodePage); err != nil { diff --git a/pkg/sqlcmd/codepage_test.go b/pkg/sqlcmd/codepage_test.go index 748c9983..b40e3ca1 100644 --- a/pkg/sqlcmd/codepage_test.go +++ b/pkg/sqlcmd/codepage_test.go @@ -92,6 +92,24 @@ func TestParseCodePage(t *testing.T) { wantErr: true, errContains: "codepage", // Error message varies by platform }, + { + name: "comma only produces no codepage", + arg: ",", + wantErr: true, + errContains: "invalid codepage", + }, + { + name: "whitespace only produces no codepage", + arg: " ", + wantErr: true, + errContains: "invalid codepage", + }, + { + name: "multiple commas produce no codepage", + arg: ",,,", + wantErr: true, + errContains: "invalid codepage", + }, { name: "Japanese Shift JIS", arg: "932", diff --git a/pkg/sqlcmd/commands_test.go b/pkg/sqlcmd/commands_test.go index 76612b77..dc28333c 100644 --- a/pkg/sqlcmd/commands_test.go +++ b/pkg/sqlcmd/commands_test.go @@ -461,11 +461,10 @@ func TestExitCommandAppendsParameterToCurrentBatch(t *testing.T) { func TestOutputCodePageCommand(t *testing.T) { tests := []struct { - name string - codepage int - expectedBytes []byte - inputText string - skipOnEncError bool + name string + codepage int + expectedBytes []byte + inputText string }{ { name: "UTF-8 output", diff --git a/pkg/sqlcmd/sqlcmd.go b/pkg/sqlcmd/sqlcmd.go index 221676e6..d6beec99 100644 --- a/pkg/sqlcmd/sqlcmd.go +++ b/pkg/sqlcmd/sqlcmd.go @@ -352,12 +352,12 @@ func (s *Sqlcmd) IncludeFile(path string, processAll bool) error { reader = transform.NewReader(f, enc.NewDecoder()) } } else { - // UTF-8 codepage: still apply BOM stripping + // UTF-8 codepage: use BOMOverride to strip UTF-8 BOM and auto-detect UTF-16 BOMs, defaulting to UTF-8 otherwise utf8bom := unicode.BOMOverride(unicode.UTF8.NewDecoder()) reader = transform.NewReader(f, utf8bom) } } else { - // Default: auto-detect BOM for UTF-16, fallback to UTF-8 + // Default: auto-detect BOMs (UTF-8/UTF-16) and decode accordingly, falling back to UTF-8 when no BOM is present utf16bom := unicode.BOMOverride(unicode.UTF8.NewDecoder()) reader = transform.NewReader(f, utf16bom) } From b96e228852538e6cbe63d2b7fb845d105de9f023 Mon Sep 17 00:00:00 2001 From: David Levy Date: Sun, 25 Jan 2026 18:26:48 -0600 Subject: [PATCH 11/13] Fix codepage error message for locale-independent formatting Use strconv.Itoa instead of %d format verb to avoid locale-based number formatting that adds thousands separators (99,999 vs 99999). This ensures consistent error messages across all platforms. --- pkg/sqlcmd/codepage_other.go | 6 +++++- pkg/sqlcmd/codepage_windows.go | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pkg/sqlcmd/codepage_other.go b/pkg/sqlcmd/codepage_other.go index c27f78dc..2908778f 100644 --- a/pkg/sqlcmd/codepage_other.go +++ b/pkg/sqlcmd/codepage_other.go @@ -6,6 +6,8 @@ package sqlcmd import ( + "strconv" + "github.com/microsoft/go-sqlcmd/internal/localizer" "golang.org/x/text/encoding" ) @@ -15,5 +17,7 @@ import ( // The built-in codepageRegistry covers the most common codepages. // For additional codepages (e.g., Japanese EBCDIC), use Windows. func getSystemCodePageEncoding(codepage int) (encoding.Encoding, error) { - return nil, localizer.Errorf("codepage %d is not supported on this platform; additional codepages are available on Windows", codepage) + // Use %s with strconv.Itoa to avoid locale-based number formatting + // that would add thousands separators (e.g., "99,999" instead of "99999") + return nil, localizer.Errorf("unsupported codepage %s", strconv.Itoa(codepage)) } diff --git a/pkg/sqlcmd/codepage_windows.go b/pkg/sqlcmd/codepage_windows.go index e2394501..4d2c019e 100644 --- a/pkg/sqlcmd/codepage_windows.go +++ b/pkg/sqlcmd/codepage_windows.go @@ -7,6 +7,7 @@ package sqlcmd import ( "errors" + "strconv" "unicode/utf16" "unsafe" @@ -177,7 +178,9 @@ func isCodePageValid(codepage uint32) bool { func getSystemCodePageEncoding(codepage int) (encoding.Encoding, error) { cp := uint32(codepage) if !isCodePageValid(cp) { - return nil, localizer.Errorf("codepage %d is not installed on this system", codepage) + // Use %s with strconv.Itoa to avoid locale-based number formatting + // that would add thousands separators (e.g., "99,999" instead of "99999") + return nil, localizer.Errorf("unsupported codepage %s", strconv.Itoa(codepage)) } return &windowsCodePageEncoding{codepage: cp}, nil } From 4ddfd1af5ce9bc7a1e8d921e319be12c903f8050 Mon Sep 17 00:00:00 2001 From: David Levy Date: Sun, 25 Jan 2026 18:37:23 -0600 Subject: [PATCH 12/13] Fix Windows codepage transformer to handle streaming correctly Address Copilot review: The windowsDecoder and windowsEncoder now properly handle the atEOF parameter and buffer incomplete sequences between Transform calls. This ensures correct behavior when transform.Reader/Writer splits multibyte sequences across chunks. Changes: - Add buffer fields to windowsDecoder and windowsEncoder structs - Use MB_ERR_INVALID_CHARS to detect incomplete sequences in decoder - Use utf8.Valid to detect incomplete UTF-8 sequences in encoder - Return transform.ErrShortSrc when more input is needed - Return error for incomplete sequences at EOF - Add TestWindowsEncodingStreaming test for streaming behavior --- pkg/sqlcmd/codepage_test.go | 72 ++++++++++++++ pkg/sqlcmd/codepage_windows.go | 168 +++++++++++++++++++++++++++++---- 2 files changed, 220 insertions(+), 20 deletions(-) diff --git a/pkg/sqlcmd/codepage_test.go b/pkg/sqlcmd/codepage_test.go index b40e3ca1..138a3520 100644 --- a/pkg/sqlcmd/codepage_test.go +++ b/pkg/sqlcmd/codepage_test.go @@ -8,6 +8,7 @@ import ( "testing" "github.com/stretchr/testify/assert" + "golang.org/x/text/transform" ) func TestParseCodePage(t *testing.T) { @@ -316,3 +317,74 @@ func TestGetEncodingWindowsFallback(t *testing.T) { assert.Error(t, err, "invalid codepage should fail on all platforms") assert.Contains(t, err.Error(), "codepage") } + +func TestWindowsEncodingStreaming(t *testing.T) { + // This test verifies that the Windows API fallback handles streaming correctly + // by properly buffering incomplete multibyte sequences + + // Japanese EBCDIC (20290) is a good test case as it's only available via Windows API + cp := 20290 // IBM EBCDIC Japanese Katakana Extended + + enc, err := GetEncoding(cp) + if err != nil { + t.Skip("Codepage 20290 not available on this platform") + } + + // Test decoder streaming with transform.Reader + t.Run("decoder streaming", func(t *testing.T) { + // Create a simple EBCDIC encoded string: "ABC" = 0xC1 0xC2 0xC3 + ebcdicData := []byte{0xC1, 0xC2, 0xC3} + + decoder := enc.NewDecoder() + + // Simulate streaming by processing one byte at a time + var result []byte + for i := 0; i < len(ebcdicData); i++ { + decoder.Reset() // Reset between chunks for clean state + dst := make([]byte, 32) + nDst, _, err := decoder.Transform(dst, ebcdicData[i:i+1], i == len(ebcdicData)-1) + if err != nil && err != transform.ErrShortSrc { + t.Fatalf("Transform failed at byte %d: %v", i, err) + } + result = append(result, dst[:nDst]...) + } + assert.Equal(t, "ABC", string(result), "streaming decode should produce 'ABC'") + }) + + // Test encoder streaming + t.Run("encoder streaming", func(t *testing.T) { + // Test encoding "ABC" one character at a time + input := "ABC" + encoder := enc.NewEncoder() + + var result []byte + for i := 0; i < len(input); i++ { + encoder.Reset() // Reset between chunks for clean state + dst := make([]byte, 32) + nDst, _, err := encoder.Transform(dst, []byte(input[i:i+1]), i == len(input)-1) + if err != nil && err != transform.ErrShortSrc { + t.Fatalf("Transform failed at char %d: %v", i, err) + } + result = append(result, dst[:nDst]...) + } + expected := []byte{0xC1, 0xC2, 0xC3} // "ABC" in EBCDIC + assert.Equal(t, expected, result, "streaming encode should produce EBCDIC ABC") + }) + + // Test encoder handles incomplete UTF-8 correctly + t.Run("encoder incomplete UTF-8", func(t *testing.T) { + encoder := enc.NewEncoder() + dst := make([]byte, 32) + + // Send first byte of a 2-byte UTF-8 sequence (é = 0xC3 0xA9) + incompleteUTF8 := []byte{0xC3} // First byte of é + _, _, err := encoder.Transform(dst, incompleteUTF8, false) + // Should return ErrShortSrc because the sequence is incomplete + assert.Equal(t, transform.ErrShortSrc, err, "incomplete UTF-8 should return ErrShortSrc when not at EOF") + + // At EOF, incomplete sequence should be an error + encoder.Reset() + _, _, err = encoder.Transform(dst, incompleteUTF8, true) + assert.Error(t, err, "incomplete UTF-8 at EOF should return error") + }) +} diff --git a/pkg/sqlcmd/codepage_windows.go b/pkg/sqlcmd/codepage_windows.go index 4d2c019e..3fe4ef49 100644 --- a/pkg/sqlcmd/codepage_windows.go +++ b/pkg/sqlcmd/codepage_windows.go @@ -9,6 +9,7 @@ import ( "errors" "strconv" "unicode/utf16" + "unicode/utf8" "unsafe" "github.com/microsoft/go-sqlcmd/internal/localizer" @@ -17,6 +18,15 @@ import ( "golang.org/x/text/transform" ) +const ( + // MB_ERR_INVALID_CHARS causes MultiByteToWideChar to fail if it encounters + // an invalid character in the source string (including incomplete sequences) + mbErrInvalidChars = 0x00000008 + // Maximum bytes that might form a single character in any Windows codepage + // (most DBCS codepages use 2 bytes, but we use 4 for safety) + maxMultibyteCharLen = 4 +) + var ( kernel32 = windows.NewLazySystemDLL("kernel32.dll") procMultiByteToWideChar = kernel32.NewProc("MultiByteToWideChar") @@ -36,43 +46,113 @@ func (e *windowsCodePageEncoding) NewEncoder() *encoding.Encoder { return &encoding.Encoder{Transformer: &windowsEncoder{codepage: e.codepage}} } -// windowsDecoder converts from a Windows codepage to UTF-8 +// windowsDecoder converts from a Windows codepage to UTF-8. +// It buffers incomplete multibyte sequences between Transform calls. type windowsDecoder struct { codepage uint32 + buf [maxMultibyteCharLen]byte // buffer for incomplete sequences + bufLen int // number of bytes in buffer } -func (d *windowsDecoder) Reset() {} +func (d *windowsDecoder) Reset() { + d.bufLen = 0 +} func (d *windowsDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { - if len(src) == 0 { + // Prepend any buffered bytes from previous call + var input []byte + if d.bufLen > 0 { + input = make([]byte, d.bufLen+len(src)) + copy(input, d.buf[:d.bufLen]) + copy(input[d.bufLen:], src) + } else { + input = src + } + + if len(input) == 0 { return 0, 0, nil } - // First call to get required buffer size for wide chars + // Try to convert with MB_ERR_INVALID_CHARS to detect incomplete sequences n, _, errno := procMultiByteToWideChar.Call( uintptr(d.codepage), - 0, - uintptr(unsafe.Pointer(&src[0])), - uintptr(len(src)), + mbErrInvalidChars, + uintptr(unsafe.Pointer(&input[0])), + uintptr(len(input)), 0, 0, ) - if n == 0 { + + // If conversion failed, it might be due to incomplete trailing sequence + if n == 0 && errno == windows.ERROR_NO_UNICODE_TRANSLATION { + if atEOF { + // At EOF with incomplete sequence - this is an error + d.bufLen = 0 + return 0, len(src), errors.New("incomplete multibyte sequence at end of input") + } + + // Not at EOF - try removing bytes from the end until conversion succeeds + // This finds the incomplete trailing sequence + for trimLen := 1; trimLen <= len(input) && trimLen <= maxMultibyteCharLen; trimLen++ { + tryLen := len(input) - trimLen + if tryLen <= 0 { + // Need more input - buffer what we have + if len(input) <= maxMultibyteCharLen { + copy(d.buf[:], input) + d.bufLen = len(input) + return 0, len(src), transform.ErrShortSrc + } + break + } + + n, _, errno = procMultiByteToWideChar.Call( + uintptr(d.codepage), + mbErrInvalidChars, + uintptr(unsafe.Pointer(&input[0])), + uintptr(tryLen), + 0, + 0, + ) + if n > 0 || errno != windows.ERROR_NO_UNICODE_TRANSLATION { + // Found a valid prefix - buffer the trailing bytes + trailingBytes := input[tryLen:] + copy(d.buf[:], trailingBytes) + d.bufLen = len(trailingBytes) + input = input[:tryLen] + break + } + } + + // If still failing, buffer everything and wait for more + if n == 0 { + if len(input) <= maxMultibyteCharLen { + copy(d.buf[:], input) + d.bufLen = len(input) + return 0, len(src), transform.ErrShortSrc + } + // Input is larger than max char length but still invalid - real error + d.bufLen = 0 + return 0, len(src), errors.New("invalid multibyte sequence") + } + } else if n == 0 { if errno != windows.ERROR_SUCCESS { + d.bufLen = 0 return 0, 0, errno } + d.bufLen = 0 return 0, 0, errors.New("MultiByteToWideChar failed") + } else { + // Success - clear buffer since we'll consume all input + d.bufLen = 0 } - // Allocate wide char buffer + // Allocate wide char buffer and do the actual conversion wideChars := make([]uint16, n) - - // Convert to wide chars n, _, errno = procMultiByteToWideChar.Call( uintptr(d.codepage), - 0, - uintptr(unsafe.Pointer(&src[0])), - uintptr(len(src)), + 0, // Don't use MB_ERR_INVALID_CHARS here - we already validated + uintptr(unsafe.Pointer(&input[0])), + uintptr(len(input)), uintptr(unsafe.Pointer(&wideChars[0])), uintptr(len(wideChars)), ) @@ -92,23 +172,71 @@ func (d *windowsDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, } copy(dst, utf8Bytes) - return len(utf8Bytes), len(src), nil + return len(utf8Bytes), len(src), err } -// windowsEncoder converts from UTF-8 to a Windows codepage +// windowsEncoder converts from UTF-8 to a Windows codepage. +// It buffers incomplete UTF-8 sequences between Transform calls. type windowsEncoder struct { codepage uint32 + buf [utf8.UTFMax]byte // buffer for incomplete UTF-8 sequences + bufLen int // number of bytes in buffer } -func (e *windowsEncoder) Reset() {} +func (e *windowsEncoder) Reset() { + e.bufLen = 0 +} func (e *windowsEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { - if len(src) == 0 { + // Prepend any buffered bytes from previous call + var input []byte + if e.bufLen > 0 { + input = make([]byte, e.bufLen+len(src)) + copy(input, e.buf[:e.bufLen]) + copy(input[e.bufLen:], src) + } else { + input = src + } + + if len(input) == 0 { return 0, 0, nil } + // Find the last complete UTF-8 sequence + validLen := len(input) + for validLen > 0 && !utf8.Valid(input[:validLen]) { + validLen-- + } + + // Check for incomplete trailing sequence + if validLen < len(input) { + trailingBytes := input[validLen:] + if atEOF { + // At EOF with incomplete UTF-8 - this is an error + e.bufLen = 0 + return 0, len(src), errors.New("incomplete UTF-8 sequence at end of input") + } + // Buffer the incomplete trailing bytes for next call + if len(trailingBytes) <= utf8.UTFMax { + copy(e.buf[:], trailingBytes) + e.bufLen = len(trailingBytes) + } else { + // Shouldn't happen with valid partial UTF-8, but handle it + e.bufLen = 0 + return 0, len(src), errors.New("invalid UTF-8 sequence") + } + input = input[:validLen] + } else { + e.bufLen = 0 + } + + if len(input) == 0 { + // Only incomplete sequence - need more input + return 0, len(src), transform.ErrShortSrc + } + // Convert UTF-8 to UTF-16 - runes := []rune(string(src)) + runes := []rune(string(input)) wideChars := utf16.Encode(runes) if len(wideChars) == 0 { @@ -155,7 +283,7 @@ func (e *windowsEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, return 0, 0, errors.New("WideCharToMultiByte failed") } - return int(n), len(src), nil + return int(n), len(src), err } // isCodePageValid checks if a codepage is valid/installed on Windows From 9c471b1b0b462eba9830301d563cbc4e158a63d4 Mon Sep 17 00:00:00 2001 From: David Levy Date: Sun, 25 Jan 2026 18:51:57 -0600 Subject: [PATCH 13/13] Address Copilot review: clarify BOM handling documentation - Fix getSystemCodePageEncoding comment to describe error return behavior - Clarify test comment in TestWindowsEncodingStreaming to match actual coverage - Improve comments in IncludeFile explaining BOMOverride behavior - Update README notes to clarify that --list-codepages shows built-in set only and that Windows may have additional codepages via OS API --- README.md | 6 +++--- pkg/sqlcmd/codepage_test.go | 5 +++-- pkg/sqlcmd/codepage_windows.go | 3 ++- pkg/sqlcmd/sqlcmd.go | 7 +++++-- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index ec53271c..bfefb307 100644 --- a/README.md +++ b/README.md @@ -295,10 +295,10 @@ sqlcmd --list-codepages #### Notes -- When no `-f` flag is specified, sqlcmd auto-detects UTF-16 BOM (Byte Order Mark) in input files and falls back to UTF-8. +- When no `-f` flag is specified, sqlcmd auto-detects UTF-8/UTF-16LE/UTF-16BE BOM (Byte Order Mark) in input files and switches to the appropriate decoder. If no BOM is present, UTF-8 is assumed. - UTF-8 input files with BOM are handled automatically. -- On Windows, additional codepages installed on the system are also available via the Windows API. -- Use `--list-codepages` to see all supported code pages with their names and descriptions. +- On Windows, additional codepages installed on the system are available via the Windows API, even if not shown by `--list-codepages`. +- Use `--list-codepages` to see the built-in code pages with their names and descriptions. ### Packages diff --git a/pkg/sqlcmd/codepage_test.go b/pkg/sqlcmd/codepage_test.go index 138a3520..6844b0af 100644 --- a/pkg/sqlcmd/codepage_test.go +++ b/pkg/sqlcmd/codepage_test.go @@ -319,8 +319,9 @@ func TestGetEncodingWindowsFallback(t *testing.T) { } func TestWindowsEncodingStreaming(t *testing.T) { - // This test verifies that the Windows API fallback handles streaming correctly - // by properly buffering incomplete multibyte sequences + // This test exercises that the Windows API fallback encoding can be used in + // streaming-like scenarios and that it handles single-byte data and + // incomplete UTF-8 input correctly. // Japanese EBCDIC (20290) is a good test case as it's only available via Windows API cp := 20290 // IBM EBCDIC Japanese Katakana Extended diff --git a/pkg/sqlcmd/codepage_windows.go b/pkg/sqlcmd/codepage_windows.go index 3fe4ef49..3eb1a746 100644 --- a/pkg/sqlcmd/codepage_windows.go +++ b/pkg/sqlcmd/codepage_windows.go @@ -302,7 +302,8 @@ func isCodePageValid(codepage uint32) bool { } // getSystemCodePageEncoding returns an encoding using Windows API for codepages -// not in our built-in registry. Returns nil if the codepage is not available. +// not in our built-in registry. If the codepage is not available, it returns +// a nil encoding and a non-nil error. func getSystemCodePageEncoding(codepage int) (encoding.Encoding, error) { cp := uint32(codepage) if !isCodePageValid(cp) { diff --git a/pkg/sqlcmd/sqlcmd.go b/pkg/sqlcmd/sqlcmd.go index d6beec99..a76d2ac5 100644 --- a/pkg/sqlcmd/sqlcmd.go +++ b/pkg/sqlcmd/sqlcmd.go @@ -352,12 +352,15 @@ func (s *Sqlcmd) IncludeFile(path string, processAll bool) error { reader = transform.NewReader(f, enc.NewDecoder()) } } else { - // UTF-8 codepage: use BOMOverride to strip UTF-8 BOM and auto-detect UTF-16 BOMs, defaulting to UTF-8 otherwise + // UTF-8 codepage (65001): BOMOverride detects UTF-8/UTF-16LE/UTF-16BE BOMs and + // switches decoder accordingly, falling back to UTF-8 when no BOM is present utf8bom := unicode.BOMOverride(unicode.UTF8.NewDecoder()) reader = transform.NewReader(f, utf8bom) } } else { - // Default: auto-detect BOMs (UTF-8/UTF-16) and decode accordingly, falling back to UTF-8 when no BOM is present + // Default (no -f flag): BOMOverride detects UTF-8/UTF-16LE/UTF-16BE BOMs at + // the start of input and switches decoder accordingly; falls back to UTF-8 + // when no BOM is present (see golang.org/x/text/encoding/unicode.BOMOverride) utf16bom := unicode.BOMOverride(unicode.UTF8.NewDecoder()) reader = transform.NewReader(f, utf16bom) }