superdoc-dev · caio-pizzol · Mar 10, 2026 · Mar 10, 2026 · Mar 10, 2026 · Mar 10, 2026
diff --git a/packages/super-editor/src/core/DocxZipper.js b/packages/super-editor/src/core/DocxZipper.js
@@ -256,7 +256,7 @@ class DocxZipper {
     const beginningString = '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">';
     let updatedContentTypesXml = contentTypesXml.replace(beginningString, `${beginningString}${typesString}`);
 
-    // Remove Override elements for comment parts that no longer exist
+    // Remove Override elements for parts that no longer exist
     for (const partName of staleOverridePartNames) {
       const escapedPartName = partName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
       const overrideRegex = new RegExp(`\\s*<Override[^>]*PartName="${escapedPartName}"[^>]*/>`, 'g');
@@ -388,8 +388,11 @@ class DocxZipper {
     const unzippedOriginalDocx = await this.unzip(originalDocxFile);
     const filePromises = [];
     unzippedOriginalDocx.forEach((relativePath, zipEntry) => {
-      const promise = zipEntry.async('string').then((content) => {
-        unzippedOriginalDocx.file(zipEntry.name, content);
+      // Read each entry as raw bytes so non-UTF-8 encodings (e.g. UTF-16 LE
+      // customXml parts) are not garbled. XML/rels files are decoded to JS
+      // strings via ensureXmlString; other entries are kept as raw bytes.
+      const promise = zipEntry.async('uint8array').then((u8) => {
+        unzippedOriginalDocx.file(zipEntry.name, isXmlLike(zipEntry.name) ? ensureXmlString(u8) : u8);
       });
       filePromises.push(promise);
     });

diff --git a/packages/super-editor/src/core/DocxZipper.test.js b/packages/super-editor/src/core/DocxZipper.test.js
@@ -105,7 +105,92 @@ describe('DocxZipper - UTF-16 XML handling', () => {
     expect(item2.content).toContain('<?xml'); // prolog present
     expect(item2.content).toContain('<properties'); // real tag (no NULs interleaved)
     expect(item2.content).not.toMatch(/\u0000/); // no embedded NULs
-    expect(item2.content.toLowerCase()).toContain('encoding="utf-16"');
+    // ensureXmlString rewrites the stale encoding declaration to UTF-8
+    expect(item2.content).toContain('encoding="UTF-8"');
+    expect(item2.content.toLowerCase()).not.toContain('encoding="utf-16"');
+  });
+
+  it('round-trips UTF-16LE XML through exportFromOriginalFile without corruption', async () => {
+    const zip = new JSZip(); // fixture: minimal docx plus one UTF-16 customXml part
+
+    const contentTypes = `<?xml version="1.0" encoding="UTF-8"?>
+      <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+        <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+        <Default Extension="xml" ContentType="application/xml"/>
+        <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+      </Types>`;
+    zip.file('[Content_Types].xml', contentTypes);
+    zip.file(
+      'word/document.xml',
+      '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"/>',
+    );
+
+    const customXmlUtf16 = `<?xml version="1.0" encoding="utf-16"?>
+<properties xmlns="http://www.imanage.com/work/xmlschema">
+  <documentid>DOC!123.1</documentid>
+</properties>`;
+    zip.file('customXml/item1.xml', utf16leWithBOM(customXmlUtf16)); // presumably BOM + UTF-16LE bytes; helper is defined outside this hunk
+
+    const originalDocxFile = await zip.generateAsync({ type: 'nodebuffer' }); // serialize the fixture archive
+
+    const result = await zipper.updateZip({
+      docx: [],
+      updatedDocs: {
+        'word/document.xml': '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"/>',
+      },
+      originalDocxFile, // NOTE(review): presumably routes updateZip through exportFromOriginalFile — confirm
+      media: {},
+      fonts: {},
+      isHeadless: true,
+    });
+
+    const readBack = await new JSZip().loadAsync(result); // re-open the exported archive
+    const customXml = await readBack.file('customXml/item1.xml').async('string'); // read the part back as text
+
+    expect(customXml).toContain('<properties'); // element name survived intact
+    expect(customXml).toContain('DOC!123.1'); // payload text survived intact
+    expect(customXml).not.toMatch(/\u0000/); // no NUL bytes from garbled UTF-16
+    expect(customXml).toContain('encoding="UTF-8"'); // declaration now names UTF-8
+    expect(customXml.toLowerCase()).not.toContain('encoding="utf-16"'); // stale declaration gone, any letter case
+  });
+
+  it('preserves binary entries unchanged through exportFromOriginalFile', async () => {
+    const zip = new JSZip(); // fixture: minimal docx plus one binary media entry
+
+    const contentTypes = `<?xml version="1.0" encoding="UTF-8"?>
+      <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+        <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+        <Default Extension="xml" ContentType="application/xml"/>
+        <Default Extension="png" ContentType="image/png"/>
+        <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+      </Types>`;
+    zip.file('[Content_Types].xml', contentTypes);
+    zip.file(
+      'word/document.xml',
+      '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"/>',
+    );
+
+    // Arbitrary binary bytes: the 8-byte PNG signature followed by fixed filler (not a decodable image)
+    const binaryData = new Uint8Array([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0xde, 0xad, 0xbe, 0xef]);
+    zip.file('word/media/image1.png', binaryData); // non-XML entry; must come back untouched
+
+    const originalDocxFile = await zip.generateAsync({ type: 'nodebuffer' }); // serialize the fixture archive
+
+    const result = await zipper.updateZip({
+      docx: [],
+      updatedDocs: {
+        'word/document.xml': '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"/>',
+      },
+      originalDocxFile, // NOTE(review): presumably routes updateZip through exportFromOriginalFile — confirm
+      media: {},
+      fonts: {},
+      isHeadless: true,
+    });
+
+    const readBack = await new JSZip().loadAsync(result); // re-open the exported archive
+    const imageBytes = await readBack.file('word/media/image1.png').async('uint8array'); // raw bytes, no text decoding
+
+    expect(imageBytes).toEqual(binaryData); // every byte must match the input
+  });
 });
 

diff --git a/packages/super-editor/src/core/encoding-helpers.js b/packages/super-editor/src/core/encoding-helpers.js
@@ -76,5 +76,14 @@ export function ensureXmlString(content) {
 
   const enc = sniffEncoding(u8);
   let xml = new TextDecoder(enc).decode(u8);
-  return stripBOM(xml);
+  xml = stripBOM(xml);
+
+  // After decoding non-UTF-8 bytes to a JS string, the XML declaration's
+  // encoding attribute is stale (e.g. encoding="utf-16"). The output will
+  // be serialized as UTF-8, so rewrite the declared encoding to match.
+  if (enc !== 'utf-8') {
+    xml = xml.replace(/(<\?xml\b[^?]*?)\bencoding\s*=\s*["'][^"']*["']/i, '$1encoding="UTF-8"');
+  }
+
+  return xml;
 }
diff --git a/packages/super-editor/src/core/encoding-helpers.test.js b/packages/super-editor/src/core/encoding-helpers.test.js
@@ -92,22 +92,30 @@ describe('ensureXmlString', () => {
     expect(out).toContain('héllo');
   });
 
-  it('decodes UTF-16LE with BOM bytes', () => {
+  it('decodes UTF-16LE with BOM bytes and rewrites encoding to UTF-8', () => {
     const u8 = utf16leWithBOM('<?xml version="1.0" encoding="utf-16"?><props><k>v</k></props>');
     const out = ensureXmlString(u8);
-    expect(out.toLowerCase()).toContain('encoding="utf-16"');
+    expect(out).toContain('encoding="UTF-8"');
+    expect(out).not.toContain('encoding="utf-16"');
     expect(out).toContain('<props>');
     expect(out).not.toMatch(/\u0000/);
   });
 
-  it('decodes UTF-16BE with BOM bytes', () => {
+  it('decodes UTF-16BE with BOM bytes and rewrites encoding to UTF-8', () => {
     const u8 = utf16beWithBOM('<?xml version="1.0" encoding="utf-16"?><props><k>v</k></props>');
     const out = ensureXmlString(u8);
-    expect(out.toLowerCase()).toContain('encoding="utf-16"');
+    expect(out).toContain('encoding="UTF-8"');
+    expect(out).not.toContain('encoding="utf-16"');
     expect(out).toContain('<props>');
     expect(out).not.toMatch(/\u0000/);
   });
 
+  it('does not rewrite encoding for UTF-8 input', () => {
+    const u8 = new TextEncoder().encode('<?xml version="1.0" encoding="utf-8"?><root/>'); // lowercase on purpose
+    const out = ensureXmlString(u8);
+    expect(out).toContain('encoding="utf-8"'); // a rewrite would have normalized this to "UTF-8"
+  });
+
   it('decodes UTF-16 (no BOM) via heuristic', () => {
     const u8 = noBOMUtf16leBytes('<?xml version="1.0"?><root>NOBOM</root>');
     const out = ensureXmlString(u8);