Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions OpenXmlPowerTools.Tests/DocumentAssemblerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ public class DaTests
[InlineData("DA264-InvalidRunLevelRepeat.docx", "DA-Data.xml", true)]
[InlineData("DA265-RunLevelRepeatWithWhiteSpaceBefore.docx", "DA-Data.xml", false)]
[InlineData("DA266-RunLevelRepeat-NoData.docx", "DA-Data.xml", true)]
[InlineData("DA268-Block-Conditional-In-Table-Cell.docx", "DA268-data.xml", false)]
public void DA101(string name, string data, bool err)
{
var sourceDir = new DirectoryInfo("../../../../TestFiles/");
Expand Down Expand Up @@ -156,6 +157,24 @@ public void DA259(string name, string data, bool err)
Assert.Equal(4, brCount);
}

[Fact]
public void DA240()
{
    // Run the whitespace template through the shared DA101 driver, then load the
    // document this test expects to find afterwards in the temp directory.
    const string templateName = "DA240-Whitespace.docx";
    DA101(templateName, "DA240-Whitespace.xml", false);

    var outputPath = Path.Combine(
        TestUtil.TempDir.FullName,
        templateName.Replace(".docx", "-processed-by-DocumentAssembler.docx"));
    var assembledDocx = new FileInfo(outputPath);
    var assembled = new WmlDocument(assembledDocx.FullName);

    // When inserted elements begin or end with white space, that white space must be preserved.
    const string expectedText = "Content may or may not have spaces: he/she; he, she; he and she.";

    // Check 1: XElement.Value is the direct concatenation of all descendant text (the W.t
    // elements). It is fast, but it pays no attention to xml:space="preserve", which Word
    // honors when rendering content -- so on its own it is not a reliable check.
    string concatenatedText = assembled.MainDocumentPart.Element(W.body).Elements(W.p).First().Value;
    Assert.Equal(expectedText, concatenatedText);

    // Check 2: InnerText goes through UnicodeMapper.RunToString, which has been enhanced
    // to take xml:space="preserve" into account.
    string mappedText = InnerText(assembled.MainDocumentPart.Element(W.body).Elements(W.p).First());
    Assert.Equal(expectedText, mappedText);
}

[Theory]
[InlineData("DA024-TrackedRevisions.docx", "DA-Data.xml")]
public void DA102_Throws(string name, string data)
Expand Down Expand Up @@ -487,6 +506,15 @@ private static string GetDocumentText(WmlDocument document)
private const string WidePngBase64 = "iVBORw0KGgoAAAANSUhEUgAAAZAAAADICAIAAABJdyC1AAACuUlEQVR4nO3UMQ7CQBAEwT3EvxEvXz/BZKalqniCifrs7gAUvGfmnO/TNwBu7H5edxuAfyFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFFzMzu2y8A5u4PQZkIj89BEMEAAAAASUVORK5CYII=";
private const string TallPngBase64 = "iVBORw0KGgoAAAANSUhEUgAAAMgAAAGQCAIAAABkkLjnAAAEF0lEQVR4nO3S0QkCURAEwX1i3mLke0lcI3hVAQzz0Wd3B+72npnzPbfv8mT72devP/CfhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkzu42y8yTXVnDDBu1Y983AAAAAElFTkSuQmCC";
private const string TruncatedGifBase64 = "R0lGODlhyABQAA==";

// Returns the text of all w:r descendants of e, converted run by run with
// UnicodeMapper.RunToString and concatenated. Runs whose parent is w:del are skipped.
private static string InnerText(XContainer e)
{
    var runTexts =
        from run in e.Descendants(W.r)
        where run.Parent.Name != W.del
        select UnicodeMapper.RunToString(run);

    return runTexts.StringConcatenate();
}

private static readonly List<string> s_ExpectedErrors = new List<string>()
{
"The 'http://schemas.openxmlformats.org/wordprocessingml/2006/main:evenHBand' attribute is not declared.",
Expand Down
50 changes: 50 additions & 0 deletions OpenXmlPowerTools.Tests/DocumentBuilderTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,56 @@ public void DB012_NumberingsWithSameAbstractNumbering()
Assert.Equal(3, numberingRoot.Elements(W.num).Count());
}

[Fact]
public void DB012a_NumberingWithZeroIdIsValid()
{
    // The input document has a numbering definition with a zero id
    // (explicitly indicating "no numbering").
    const string inputName = "DB012a-No-Numbering0.docx";
    var testFilesDir = new DirectoryInfo("../../../../TestFiles/");
    var inputDocx = new FileInfo(Path.Combine(testFilesDir.FullName, inputName));

    // Output goes to the temp directory under a "-processed-by-DocumentBuilder" name.
    var outputName = inputDocx.Name.Replace(".docx", "-processed-by-DocumentBuilder.docx");
    var outputDocx = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, outputName));

    // Build from that single source, then run the shared Validate helper on the result.
    var sources = new List<Source>();
    sources.Add(new Source(new WmlDocument(inputDocx.FullName)));

    DocumentBuilder.BuildDocument(sources, outputDocx.FullName);
    Validate(outputDocx);
}

[Fact]
public void DB012b_NumberingWithZeroIdWorks()
{
    // Builds a document from two sources: the first has its leading paragraph replaced by a
    // PtOpenXml.Insert placeholder (Id "Front"), and the second is supplied with the same
    // "Front" id. The built result is then passed to the shared Validate helper.
    var sourceDir = new DirectoryInfo("../../../../TestFiles/");
    var source0 = new FileInfo(Path.Combine(sourceDir.FullName, "DB012a-No-Numbering0.docx"));
    var source1 = new FileInfo(Path.Combine(sourceDir.FullName, "DB012a-No-Numbering1.docx"));

    // NOTE: despite its name, doc1 is loaded from source0.
    var doc1 = new WmlDocument(source0.FullName);
    using (var mem = new MemoryStream())
    {
        mem.Write(doc1.DocumentByteArray, 0, doc1.DocumentByteArray.Length);
        using (var doc = WordprocessingDocument.Open(mem, true))
        {
            var xDoc = doc.MainDocumentPart.GetXDocument();
            var frontMatterPara = xDoc.Root.Elements(W.body).Elements(W.p).FirstOrDefault();

            // FirstOrDefault returns null when the body has no paragraph; fail with a clear
            // assertion instead of a NullReferenceException on the next line.
            Assert.NotNull(frontMatterPara);

            frontMatterPara.ReplaceWith(
                new XElement(PtOpenXml.Insert,
                    new XAttribute("Id", "Front")));
            doc.MainDocumentPart.PutXDocument();
        }

        // Copy the stream contents back only after the package's using block has ended.
        doc1.DocumentByteArray = mem.ToArray();
    }

    var sources = new List<Source>()
    {
        // NOTE(review): the boolean is presumably Source's keep-sections flag -- confirm.
        new Source(doc1, true),
        // The string matches the Id of the Insert placeholder written above.
        new Source(new WmlDocument(source1.FullName), "Front"),
    };
    var processedDestDocx =
        new FileInfo(Path.Combine(TestUtil.TempDir.FullName, "DB012b-NumberingWithZeroIdWorks.docx"));
    DocumentBuilder.BuildDocument(sources, processedDestDocx.FullName);
    Validate(processedDestDocx);
}

[Fact]
public void DB013a_LocalizedStyleIds_Heading()
{
Expand Down
120 changes: 120 additions & 0 deletions OpenXmlPowerTools.Tests/UnicodeMapperTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -153,5 +153,125 @@ public void IgnoresTemporaryLayoutMarkers()
// characters) should exactly match the output of UnicodeMapper:
Assert.Equal(p.Value, actual);
}

// Minimal WordprocessingML document used by HonorsXmlSpace: a single paragraph with four
// runs. Two <w:t> elements carry xml:space="preserve" and two do not; the last one holds
// a multi-line text node. The whitespace inside this literal is part of the test input.
private const string PreserveSpacingXmlString =
@"<w:document xmlns:w=""http://schemas.openxmlformats.org/wordprocessingml/2006/main"">
<w:body>
<w:p>
<w:r>
<w:t xml:space=""preserve"">The following space is retained: </w:t>
</w:r>
<w:r>
<w:t>but this one is not: </w:t>
</w:r>
<w:r>
<w:t xml:space=""preserve"">. Similarly these two lines should have only a space between them: </w:t>
</w:r>
<w:r>
<w:t>
Line 1!
Line 2!
</w:t>
</w:r>
</w:p>
</w:body>
</w:document>";

[Fact]
public void HonorsXmlSpace()
{
    // This somewhat rudimentary test is superseded by TreatsXmlSpaceLikeWord(), but it is
    // kept as a simple, direct illustration of a couple of the specific cases covered by
    // that more extensive suite.
    const string expectedText = @"The following space is retained: but this one is not:. Similarly these two lines should have only a space between them: Line 1! Line 2!";

    // Take the last paragraph of the inline sample document and convert it run by run.
    var lastParagraph = XDocument.Parse(PreserveSpacingXmlString).Descendants(W.p).Last();
    var runTexts = lastParagraph.Descendants(W.r).Select(UnicodeMapper.RunToString);

    Assert.Equal(expectedText, runTexts.StringConcatenate());
}

// Verifies that UnicodeMapper.RunToString interprets whitespace in <w:t> elements
// exactly the way Microsoft Word does, including honoring xml:space="preserve".
// This is essential because RunToString is used by higher-level features
// (OpenXmlRegex, DocumentAssembler, etc.) that rely on its output to reflect the
// text an end user would actually see and edit in Word.
//
// Word accepts a wide range of "valid" DOCX input, but it normalizes that input
// into a canonical form when displaying or saving the document. This test compares
// RunToString's output for each test paragraph against the paragraph at the same
// index in a copy of the document that was opened and saved by Word.
[Fact]
public void TreatsXmlSpaceLikeWord()
{
    var sourceDir = new System.IO.DirectoryInfo("../../../../TestFiles/");

    // Test document: crafted to include many whitespace patterns that Word accepts as valid input.
    var testDoc = new System.IO.FileInfo(System.IO.Path.Combine(sourceDir.FullName, "UM-Whitespace-test.docx"));
    var testWmlDoc = new WmlDocument(testDoc.FullName);
    var testParagraphs = testWmlDoc.MainDocumentPart
        .Element(W.body)
        .Elements(W.p).ToList();

    // Canonical document: the same test document after being opened and saved by Word,
    // representing Word's own normalized interpretation of that whitespace.
    var expectedDoc = new System.IO.FileInfo(System.IO.Path.Combine(sourceDir.FullName, "UM-Whitespace-Word-saved.docx"));
    var expectedWmlDoc = new WmlDocument(expectedDoc.FullName);
    var expectedParagraphs = expectedWmlDoc.MainDocumentPart
        .Element(W.body)
        .Elements(W.p).ToList();

    // Paragraphs come in pairs: [i] holds the test name, [i + 1] holds the test content.
    // A trailing unpaired paragraph (odd count) is not visited by this loop.
    for (int i = 0; i < testParagraphs.Count - 1; i += 2)
    {
        var testNameParagraph = testParagraphs[i];
        var testContentParagraph = testParagraphs[i + 1];

        // Get the test name from the first paragraph of the pair.
        var testName = testNameParagraph.Descendants(W.t)
            .Select(t => (string)t)
            .StringConcatenate();

        // Get the actual result by calling UnicodeMapper.RunToString on the test content runs.
        var actualResult = testContentParagraph.Descendants(W.r)
            .Select(UnicodeMapper.RunToString)
            .StringConcatenate();

        // The expected paragraph is looked up by index, so make sure it exists; otherwise a
        // shorter Word-saved fixture would surface as an ArgumentOutOfRangeException that
        // does not say which test case was affected.
        Assert.True(
            i + 1 < expectedParagraphs.Count,
            $"Test '{testName}' has no corresponding paragraph (index {i + 1}) in the Word-saved document."
        );

        // Find the corresponding expected result paragraph (same index in the expected document).
        var expectedResult = ExtractExpectedFromWord(expectedParagraphs[i + 1]);
        Assert.True(
            expectedResult == actualResult,
            $"Test '{testName}' failed. Expected: [{expectedResult}] Actual: [{actualResult}]"
        );
    }
}

// Builds the expected text for one paragraph of the Word-saved whitespace document.
// Only the direct w:r children of p are read, and inside each run only these children
// are understood:
//   <w:t>                       -> its text, appended as-is
//   <w:tab/>                    -> '\t'
//   <w:lastRenderedPageBreak/>  -> ignored on purpose
// Any other run-level element means Word has emitted something this test was not
// designed to handle, so the helper throws rather than guessing. That keeps it from
// drifting toward a reimplementation of UnicodeMapper.RunToString.
private static string ExtractExpectedFromWord(XElement p)
{
    var text = new System.Text.StringBuilder();

    // Walk every child of every run, in document order.
    foreach (var runChild in p.Elements(W.r).SelectMany(run => run.Elements()))
    {
        var childName = runChild.Name;

        if (childName == W.lastRenderedPageBreak)
        {
            continue;
        }

        if (childName == W.t)
        {
            text.Append((string)runChild);
            continue;
        }

        if (childName == W.tab)
        {
            text.Append('\t');
            continue;
        }

        throw new System.InvalidOperationException(
            $"Unexpected element <{childName.LocalName}> encountered in expected Word output.");
    }

    return text.ToString();
}
}
}
34 changes: 31 additions & 3 deletions OpenXmlPowerTools/DocumentAssembler/DocumentAssembler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -654,7 +654,7 @@ private class RunReplacementInfo
p.Add(new XElement(W.r,
para.Elements(W.r).Elements(W.rPr).FirstOrDefault(),
(p.Elements().Count() > 1) ? new XElement(W.br) : null,
new XElement(W.t, line)));
new XElement(W.t, GetXmlSpaceAttribute(line), line)));
}
return p;
}
Expand All @@ -666,7 +666,7 @@ private class RunReplacementInfo
list.Add(new XElement(W.r,
run.Elements().Where(e => e.Name != W.t),
(list.Count > 0) ? new XElement(W.br) : null,
new XElement(W.t, line)));
new XElement(W.t, GetXmlSpaceAttribute(line), line)));
}
return list;
}
Expand Down Expand Up @@ -873,9 +873,24 @@ private class RunReplacementInfo
}
return null;
}
var transformedNodes = element.Nodes().Select(n => ContentReplacementTransform(n, data, templateError, owningPart));
if (element.Name == W.tc)
{
// Check if the table cell contains any block-level elements
// Valid block-level elements in a table cell: p (paragraph), tbl (table), sdt (structured document tag), customXml
var nodesList = transformedNodes.ToList();
var hasBlockLevelContent = nodesList.Any(n => n is XElement xe &&
(xe.Name == W.p || xe.Name == W.tbl || xe.Name == W.sdt || xe.Name == W.customXml));
if (!hasBlockLevelContent)
{
// Table cells must contain at least one block-level element -- add an empty paragraph
nodesList.Add(new XElement(W.p));
}
transformedNodes = nodesList;
}
return new XElement(element.Name,
element.Attributes(),
element.Nodes().Select(n => ContentReplacementTransform(n, data, templateError, owningPart)));
transformedNodes);
}
return node;
}
Expand Down Expand Up @@ -1400,5 +1415,18 @@ private static string EvaluateXPathToString(XElement element, string xPath, bool

return xPathSelectResult.ToString();
}

private static XAttribute GetXmlSpaceAttribute(string textOfTextElement)
{
if (!string.IsNullOrEmpty(textOfTextElement))
{
if (char.IsWhiteSpace(textOfTextElement[0]) ||
char.IsWhiteSpace(textOfTextElement[textOfTextElement.Length - 1]))
{
return new XAttribute(XNamespace.Xml + "space", "preserve");
}
Comment on lines +1421 to +1427
Copy link

Copilot AI Jan 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These 'if' statements can be combined.

Suggested change
if (!string.IsNullOrEmpty(textOfTextElement))
{
if (char.IsWhiteSpace(textOfTextElement[0]) ||
char.IsWhiteSpace(textOfTextElement[textOfTextElement.Length - 1]))
{
return new XAttribute(XNamespace.Xml + "space", "preserve");
}
if (!string.IsNullOrEmpty(textOfTextElement) &&
(char.IsWhiteSpace(textOfTextElement[0]) ||
char.IsWhiteSpace(textOfTextElement[textOfTextElement.Length - 1])))
{
return new XAttribute(XNamespace.Xml + "space", "preserve");

Copilot uses AI. Check for mistakes.
}
return null;
}
Comment on lines +1419 to +1430
Copy link

Copilot AI Jan 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The GetXmlSpaceAttribute method should return null explicitly when the condition is not met instead of relying on implicit null return. This makes the intent clearer and follows best practices for nullable return types.

Consider adding an explicit return null; statement after the closing brace of the if block for better code clarity.

Copilot uses AI. Check for mistakes.
}
}
Loading
Loading