Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions packages/super-editor/src/core/DocxZipper.js
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ class DocxZipper {
const beginningString = '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">';
let updatedContentTypesXml = contentTypesXml.replace(beginningString, `${beginningString}${typesString}`);

// Remove Override elements for comment parts that no longer exist
// Remove Override elements for parts that no longer exist
for (const partName of staleOverridePartNames) {
const escapedPartName = partName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
const overrideRegex = new RegExp(`\\s*<Override[^>]*PartName="${escapedPartName}"[^>]*/>`, 'g');
Expand Down Expand Up @@ -388,8 +388,11 @@ class DocxZipper {
const unzippedOriginalDocx = await this.unzip(originalDocxFile);
const filePromises = [];
unzippedOriginalDocx.forEach((relativePath, zipEntry) => {
const promise = zipEntry.async('string').then((content) => {
unzippedOriginalDocx.file(zipEntry.name, content);
// Read as raw bytes to handle non-UTF-8 encodings (e.g. UTF-16 LE
// customXml parts). XML/rels files are decoded to valid UTF-8 strings;
// other entries are kept as raw bytes.
const promise = zipEntry.async('uint8array').then((u8) => {
unzippedOriginalDocx.file(zipEntry.name, isXmlLike(zipEntry.name) ? ensureXmlString(u8) : u8);
});
filePromises.push(promise);
});
Expand Down
87 changes: 86 additions & 1 deletion packages/super-editor/src/core/DocxZipper.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,92 @@ describe('DocxZipper - UTF-16 XML handling', () => {
expect(item2.content).toContain('<?xml'); // prolog present
expect(item2.content).toContain('<properties'); // real tag (no NULs interleaved)
expect(item2.content).not.toMatch(/\u0000/); // no embedded NULs
expect(item2.content.toLowerCase()).toContain('encoding="utf-16"');
// ensureXmlString rewrites the stale encoding declaration to UTF-8
expect(item2.content).toContain('encoding="UTF-8"');
expect(item2.content.toLowerCase()).not.toContain('encoding="utf-16"');
});

it('round-trips UTF-16LE XML through exportFromOriginalFile without corruption', async () => {
  // Source package: content types, a minimal main document, and a customXml
  // part stored as UTF-16LE bytes with a BOM.
  const sourceZip = new JSZip();

  sourceZip.file(
    '[Content_Types].xml',
    `<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>`,
  );
  sourceZip.file(
    'word/document.xml',
    '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"/>',
  );

  const utf16Source = `<?xml version="1.0" encoding="utf-16"?>
<properties xmlns="http://www.imanage.com/work/xmlschema">
<documentid>DOC!123.1</documentid>
</properties>`;
  sourceZip.file('customXml/item1.xml', utf16leWithBOM(utf16Source));

  const originalDocxFile = await sourceZip.generateAsync({ type: 'nodebuffer' });

  // Export through the zipper in headless mode, replacing only the main document part.
  const exported = await zipper.updateZip({
    docx: [],
    updatedDocs: {
      'word/document.xml': '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"/>',
    },
    originalDocxFile,
    media: {},
    fonts: {},
    isHeadless: true,
  });

  // Re-open the exported package and read the customXml part back as text.
  const reopened = await new JSZip().loadAsync(exported);
  const roundTripped = await reopened.file('customXml/item1.xml').async('string');

  // The payload must be intact and free of interleaved NULs from a bad UTF-16 decode.
  expect(roundTripped).toContain('<properties');
  expect(roundTripped).toContain('DOC!123.1');
  expect(roundTripped).not.toMatch(/\u0000/);

  // The declared encoding must be UTF-8, with no stale utf-16 label left behind.
  expect(roundTripped).toContain('encoding="UTF-8"');
  expect(roundTripped.toLowerCase()).not.toContain('encoding="utf-16"');
});

it('preserves binary entries unchanged through exportFromOriginalFile', async () => {
  // Source package: content types (including a png default), a minimal main
  // document, and one binary media entry.
  const sourceZip = new JSZip();

  sourceZip.file(
    '[Content_Types].xml',
    `<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Default Extension="png" ContentType="image/png"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>`,
  );
  sourceZip.file(
    'word/document.xml',
    '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"/>',
  );

  // Fake PNG signature followed by a few arbitrary bytes; not valid text in any encoding.
  const pngBytes = new Uint8Array([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, 0xde, 0xad, 0xbe, 0xef]);
  sourceZip.file('word/media/image1.png', pngBytes);

  const originalDocxFile = await sourceZip.generateAsync({ type: 'nodebuffer' });

  // Export through the zipper in headless mode, replacing only the main document part.
  const exported = await zipper.updateZip({
    docx: [],
    updatedDocs: {
      'word/document.xml': '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"/>',
    },
    originalDocxFile,
    media: {},
    fonts: {},
    isHeadless: true,
  });

  // The media entry must come back byte-for-byte identical.
  const reopened = await new JSZip().loadAsync(exported);
  const roundTrippedBytes = await reopened.file('word/media/image1.png').async('uint8array');

  expect(roundTrippedBytes).toEqual(pngBytes);
});
});

Expand Down
11 changes: 10 additions & 1 deletion packages/super-editor/src/core/encoding-helpers.js
Original file line number Diff line number Diff line change
Expand Up @@ -76,5 +76,14 @@ export function ensureXmlString(content) {

const enc = sniffEncoding(u8);
let xml = new TextDecoder(enc).decode(u8);
return stripBOM(xml);
xml = stripBOM(xml);

// After converting from non-UTF-8 to a JS string, the XML declaration's
// encoding attribute is stale (e.g. encoding="utf-16"). The output will
// be serialized as UTF-8, so rewrite the declared encoding to match.
if (enc !== 'utf-8') {
xml = xml.replace(/(<\?xml\b[^?]*?)\bencoding\s*=\s*["'][^"']*["']/i, '$1encoding="UTF-8"');
}

return xml;
}
16 changes: 12 additions & 4 deletions packages/super-editor/src/core/encoding-helpers.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -92,22 +92,30 @@ describe('ensureXmlString', () => {
expect(out).toContain('héllo');
});

it('decodes UTF-16LE with BOM bytes', () => {
it('decodes UTF-16LE with BOM bytes and rewrites encoding to UTF-8', () => {
const u8 = utf16leWithBOM('<?xml version="1.0" encoding="utf-16"?><props><k>v</k></props>');
const out = ensureXmlString(u8);
expect(out.toLowerCase()).toContain('encoding="utf-16"');
expect(out).toContain('encoding="UTF-8"');
expect(out).not.toContain('encoding="utf-16"');
expect(out).toContain('<props>');
expect(out).not.toMatch(/\u0000/);
});

it('decodes UTF-16BE with BOM bytes', () => {
it('decodes UTF-16BE with BOM bytes and rewrites encoding to UTF-8', () => {
const u8 = utf16beWithBOM('<?xml version="1.0" encoding="utf-16"?><props><k>v</k></props>');
const out = ensureXmlString(u8);
expect(out.toLowerCase()).toContain('encoding="utf-16"');
expect(out).toContain('encoding="UTF-8"');
expect(out).not.toContain('encoding="utf-16"');
expect(out).toContain('<props>');
expect(out).not.toMatch(/\u0000/);
});

it('does not rewrite encoding for UTF-8 input', () => {
  // Use a lowercase encoding label on purpose: the rewrite path always emits
  // encoding="UTF-8", so an input that already says "UTF-8" could not tell
  // "left untouched" apart from "rewritten".
  const xml = '<?xml version="1.0" encoding="utf-8"?><root/>';
  const out = ensureXmlString(new TextEncoder().encode(xml));

  // The original declaration must survive exactly as written.
  expect(out).toContain('encoding="utf-8"');
  expect(out).not.toContain('encoding="UTF-8"');
  expect(out).toContain('<root/>');
});

it('decodes UTF-16 (no BOM) via heuristic', () => {
const u8 = noBOMUtf16leBytes('<?xml version="1.0"?><root>NOBOM</root>');
const out = ensureXmlString(u8);
Expand Down
Loading