diff --git a/packages/super-editor/src/core/DocxZipper.js b/packages/super-editor/src/core/DocxZipper.js index 0766cf17d..92c003ebb 100644 --- a/packages/super-editor/src/core/DocxZipper.js +++ b/packages/super-editor/src/core/DocxZipper.js @@ -256,7 +256,7 @@ class DocxZipper { const beginningString = ''; let updatedContentTypesXml = contentTypesXml.replace(beginningString, `${beginningString}${typesString}`); - // Remove Override elements for comment parts that no longer exist + // Remove Override elements for parts that no longer exist for (const partName of staleOverridePartNames) { const escapedPartName = partName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); const overrideRegex = new RegExp(`\\s*]*PartName="${escapedPartName}"[^>]*/>`, 'g'); @@ -388,8 +388,11 @@ class DocxZipper { const unzippedOriginalDocx = await this.unzip(originalDocxFile); const filePromises = []; unzippedOriginalDocx.forEach((relativePath, zipEntry) => { - const promise = zipEntry.async('string').then((content) => { - unzippedOriginalDocx.file(zipEntry.name, content); + // Read as raw bytes to handle non-UTF-8 encodings (e.g. UTF-16 LE + // customXml parts). XML/rels files are decoded to valid UTF-8 strings; + // other entries are kept as raw bytes. + const promise = zipEntry.async('uint8array').then((u8) => { + unzippedOriginalDocx.file(zipEntry.name, isXmlLike(zipEntry.name) ? 
ensureXmlString(u8) : u8); }); filePromises.push(promise); }); diff --git a/packages/super-editor/src/core/DocxZipper.test.js b/packages/super-editor/src/core/DocxZipper.test.js index 9541f6ff5..178c1e88b 100644 --- a/packages/super-editor/src/core/DocxZipper.test.js +++ b/packages/super-editor/src/core/DocxZipper.test.js @@ -105,7 +105,92 @@ describe('DocxZipper - UTF-16 XML handling', () => { expect(item2.content).toContain(' { + const zip = new JSZip(); + + const contentTypes = ` + + + + + `; + zip.file('[Content_Types].xml', contentTypes); + zip.file( + 'word/document.xml', + '', + ); + + const customXmlUtf16 = ` + + DOC!123.1 +`; + zip.file('customXml/item1.xml', utf16leWithBOM(customXmlUtf16)); + + const originalDocxFile = await zip.generateAsync({ type: 'nodebuffer' }); + + const result = await zipper.updateZip({ + docx: [], + updatedDocs: { + 'word/document.xml': '', + }, + originalDocxFile, + media: {}, + fonts: {}, + isHeadless: true, + }); + + const readBack = await new JSZip().loadAsync(result); + const customXml = await readBack.file('customXml/item1.xml').async('string'); + + expect(customXml).toContain(' { + const zip = new JSZip(); + + const contentTypes = ` + + + + + + `; + zip.file('[Content_Types].xml', contentTypes); + zip.file( + 'word/document.xml', + '', + ); + + // Arbitrary binary bytes (fake PNG header + random data) + const binaryData = new Uint8Array([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0xde, 0xad, 0xbe, 0xef]); + zip.file('word/media/image1.png', binaryData); + + const originalDocxFile = await zip.generateAsync({ type: 'nodebuffer' }); + + const result = await zipper.updateZip({ + docx: [], + updatedDocs: { + 'word/document.xml': '', + }, + originalDocxFile, + media: {}, + fonts: {}, + isHeadless: true, + }); + + const readBack = await new JSZip().loadAsync(result); + const imageBytes = await readBack.file('word/media/image1.png').async('uint8array'); + + expect(imageBytes).toEqual(binaryData); }); }); diff --git 
a/packages/super-editor/src/core/encoding-helpers.js b/packages/super-editor/src/core/encoding-helpers.js index f04eeecad..aad7694f2 100644 --- a/packages/super-editor/src/core/encoding-helpers.js +++ b/packages/super-editor/src/core/encoding-helpers.js @@ -76,5 +76,14 @@ export function ensureXmlString(content) { const enc = sniffEncoding(u8); let xml = new TextDecoder(enc).decode(u8); - return stripBOM(xml); + xml = stripBOM(xml); + + // After converting from non-UTF-8 to a JS string, the XML declaration's + // encoding attribute is stale (e.g. encoding="utf-16"). The output will + // be serialized as UTF-8, so rewrite the encoding attribute to match. + if (enc !== 'utf-8') { + xml = xml.replace(/(<\?xml\b[^?]*?)\bencoding\s*=\s*["'][^"']*["']/i, '$1encoding="UTF-8"'); + } + + return xml; } diff --git a/packages/super-editor/src/core/encoding-helpers.test.js b/packages/super-editor/src/core/encoding-helpers.test.js index 7a0a154bf..6f8d15641 100644 --- a/packages/super-editor/src/core/encoding-helpers.test.js +++ b/packages/super-editor/src/core/encoding-helpers.test.js @@ -92,22 +92,30 @@ describe('ensureXmlString', () => { expect(out).toContain('héllo'); }); - it('decodes UTF-16LE with BOM bytes', () => { + it('decodes UTF-16LE with BOM bytes and rewrites encoding to UTF-8', () => { const u8 = utf16leWithBOM('v'); const out = ensureXmlString(u8); - expect(out.toLowerCase()).toContain('encoding="utf-16"'); + expect(out).toContain('encoding="UTF-8"'); + expect(out).not.toContain('encoding="utf-16"'); expect(out).toContain(''); expect(out).not.toMatch(/\u0000/); }); - it('decodes UTF-16BE with BOM bytes', () => { + it('decodes UTF-16BE with BOM bytes and rewrites encoding to UTF-8', () => { const u8 = utf16beWithBOM('v'); const out = ensureXmlString(u8); - expect(out.toLowerCase()).toContain('encoding="utf-16"'); + expect(out).toContain('encoding="UTF-8"'); + expect(out).not.toContain('encoding="utf-16"'); expect(out).toContain(''); 
expect(out).not.toMatch(/\u0000/); }); + it('does not rewrite encoding for UTF-8 input', () => { + const u8 = new TextEncoder().encode(''); + const out = ensureXmlString(u8); + expect(out).toContain('encoding="UTF-8"'); + }); + it('decodes UTF-16 (no BOM) via heuristic', () => { const u8 = noBOMUtf16leBytes('NOBOM'); const out = ensureXmlString(u8);