Skip to content

Commit 02a7167

Browse files
authored
Merge pull request #183 from opendocx/fix/whitespace-handling
Fix whitespace handling in UnicodeMapper and DocumentAssembler
2 parents f56266e + aa10c34 commit 02a7167

File tree

8 files changed

+217
-3
lines changed

8 files changed

+217
-3
lines changed

OpenXmlPowerTools.Tests/DocumentAssemblerTests.cs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,24 @@ public void DA259(string name, string data, bool err)
157157
Assert.Equal(4, brCount);
158158
}
159159

160+
[Fact]
public void DA240()
{
    // Text the first paragraph must contain once the template has been assembled.
    const string expectedText = "Content may or may not have spaces: he/she; he, she; he and she.";
    const string templateName = "DA240-Whitespace.docx";

    DA101(templateName, "DA240-Whitespace.xml", false);

    string assembledName = templateName.Replace(".docx", "-processed-by-DocumentAssembler.docx");
    var assembledDocx = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, assembledName));
    var afterAssembling = new WmlDocument(assembledDocx.FullName);
    var firstParagraph = afterAssembling.MainDocumentPart.Element(W.body).Elements(W.p).First();

    // When inserted content begins or ends with white space, that white space must survive.
    // XElement.Value is the direct concatenation of all W.t elements: fast, but it ignores
    // xml:space="preserve" attributes, which Word honors when rendering content.
    string concatenatedText = firstParagraph.Value;
    Assert.Equal(expectedText, concatenatedText);

    // So the same paragraph is also checked through UnicodeMapper.RunToString (via InnerText),
    // which takes xml:space="preserve" into account.
    string mappedText = InnerText(firstParagraph);
    Assert.Equal(expectedText, mappedText);
}
177+
160178
[Theory]
161179
[InlineData("DA024-TrackedRevisions.docx", "DA-Data.xml")]
162180
public void DA102_Throws(string name, string data)
@@ -488,6 +506,15 @@ private static string GetDocumentText(WmlDocument document)
488506
private const string WidePngBase64 = "iVBORw0KGgoAAAANSUhEUgAAAZAAAADICAIAAABJdyC1AAACuUlEQVR4nO3UMQ7CQBAEwT3EvxEvXz/BZKalqniCifrs7gAUvGfmnO/TNwBu7H5edxuAfyFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFFzMzu2y8A5u4PQZkIj89BEMEAAAAASUVORK5CYII=";
489507
private const string TallPngBase64 = "iVBORw0KGgoAAAANSUhEUgAAAMgAAAGQCAIAAABkkLjnAAAEF0lEQVR4nO3S0QkCURAEwX1i3mLke0lcI3hVAQzz0Wd3B+72npnzPbfv8mT72devP/CfhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkzu42y8yTXVnDDBu1Y983AAAAAElFTkSuQmCC";
490508
private const string TruncatedGifBase64 = "R0lGODlhyABQAA==";
509+
510+
// Concatenates the UnicodeMapper.RunToString output of every descendant run of the
// given container, skipping runs whose direct parent is a w:del element.
private static string InnerText(XContainer e)
{
    var runTexts =
        from run in e.Descendants(W.r)
        where run.Parent.Name != W.del
        select UnicodeMapper.RunToString(run);

    return runTexts.StringConcatenate();
}
517+
491518
private static readonly List<string> s_ExpectedErrors = new List<string>()
492519
{
493520
"The 'http://schemas.openxmlformats.org/wordprocessingml/2006/main:evenHBand' attribute is not declared.",

OpenXmlPowerTools.Tests/UnicodeMapperTests.cs

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,5 +153,125 @@ public void IgnoresTemporaryLayoutMarkers()
153153
// characters) should exactly match the output of UnicodeMapper:
154154
Assert.Equal(p.Value, actual);
155155
}
156+
157+
// Minimal WordprocessingML document parsed by HonorsXmlSpace(): a single paragraph with four
// runs whose w:t elements alternate between carrying xml:space="preserve" and omitting it.
// The last w:t spreads its text over several lines. White space inside this literal is test data.
private const string PreserveSpacingXmlString =
158+
@"<w:document xmlns:w=""http://schemas.openxmlformats.org/wordprocessingml/2006/main"">
159+
<w:body>
160+
<w:p>
161+
<w:r>
162+
<w:t xml:space=""preserve"">The following space is retained: </w:t>
163+
</w:r>
164+
<w:r>
165+
<w:t>but this one is not: </w:t>
166+
</w:r>
167+
<w:r>
168+
<w:t xml:space=""preserve"">. Similarly these two lines should have only a space between them: </w:t>
169+
</w:r>
170+
<w:r>
171+
<w:t>
172+
Line 1!
173+
Line 2!
174+
</w:t>
175+
</w:r>
176+
</w:p>
177+
</w:body>
178+
</w:document>";
179+
180+
[Fact]
public void HonorsXmlSpace()
{
    // This somewhat rudimentary test is superseded by TreatsXmlSpaceLikeWord(), but it is
    // kept as a simple, direct illustration of a couple of the specific cases that the
    // more extensive suite covers.
    XElement lastParagraph = XDocument.Parse(PreserveSpacingXmlString)
        .Descendants(W.p)
        .Last();

    var runTexts =
        from run in lastParagraph.Descendants(W.r)
        select UnicodeMapper.RunToString(run);
    string innerText = runTexts.StringConcatenate();

    Assert.Equal(@"The following space is retained: but this one is not:. Similarly these two lines should have only a space between them: Line 1! Line 2!", innerText);
}
193+
194+
// Verifies that UnicodeMapper.RunToString interprets whitespace in <w:t> elements
195+
// exactly the way Microsoft Word does, including honoring xml:space="preserve".
196+
// This is essential because RunToString is used by higher‑level features
197+
// (OpenXmlRegex, DocumentAssembler, etc.) that rely on its output to reflect the
198+
// text an end‑user would actually see and edit in Word.
199+
//
200+
// Word accepts a wide range of “valid” DOCX input, but it normalizes that input
201+
// into a canonical form when displaying or saving the document. These tests
202+
// compare RunToString’s output against Word’s canonicalized output to ensure
203+
// that whitespace is treated as semantic content in the same way Word treats it.
204+
[Fact]
public void TreatsXmlSpaceLikeWord()
{
    var sourceDir = new System.IO.DirectoryInfo("../../../../TestFiles/");

    // Test document: crafted to include many whitespace patterns that Word accepts as valid input.
    var testDoc = new System.IO.FileInfo(System.IO.Path.Combine(sourceDir.FullName, "UM-Whitespace-test.docx"));
    var testWmlDoc = new WmlDocument(testDoc.FullName);
    var testParagraphs = testWmlDoc.MainDocumentPart
        .Element(W.body)
        .Elements(W.p).ToList();

    // Canonical document: the same test document after being opened and saved by Word,
    // representing Word's own normalized interpretation of that whitespace.
    var expectedDoc = new System.IO.FileInfo(System.IO.Path.Combine(sourceDir.FullName, "UM-Whitespace-Word-saved.docx"));
    var expectedWmlDoc = new WmlDocument(expectedDoc.FullName);
    var expectedParagraphs = expectedWmlDoc.MainDocumentPart
        .Element(W.body)
        .Elements(W.p).ToList();

    // Paragraphs come in pairs: [i] holds the test name, [i + 1] holds the test content.
    // The expected text lives at the same index (i + 1) in the Word-saved document.
    for (int i = 0; i < testParagraphs.Count - 1; i += 2)
    {
        var testNameParagraph = testParagraphs[i];
        var testContentParagraph = testParagraphs[i + 1];

        // Get the test name from the first paragraph of the pair.
        var testName = testNameParagraph.Descendants(W.t)
            .Select(t => (string)t)
            .StringConcatenate();

        // Get the actual result by calling UnicodeMapper.RunToString on the test content runs.
        var actualResult = testContentParagraph.Descendants(W.r)
            .Select(UnicodeMapper.RunToString)
            .StringConcatenate();

        // Fail with the test name instead of a bare ArgumentOutOfRangeException when the two
        // fixture documents are out of step (the Word-saved copy has fewer paragraphs).
        Assert.True(
            i + 1 < expectedParagraphs.Count,
            $"Test '{testName}' failed. The Word-saved document has no paragraph at index {i + 1}."
        );

        var expectedResult = ExtractExpectedFromWord(expectedParagraphs[i + 1]);
        Assert.True(
            expectedResult == actualResult,
            $"Test '{testName}' failed. Expected: [{expectedResult}] Actual: [{actualResult}]"
        );
    }
}
242+
243+
// Builds the expected text for one paragraph of Word's canonicalized output in the
// whitespace tests. Only the run-level constructs that Word emits in the saved copy of
// UM-Whitespace-test.docx (UM-Whitespace-Word-saved.docx) are understood:
//   • <w:t>                        → its literal text
//   • <w:tab/>                     → '\t'
//   • <w:lastRenderedPageBreak/>   → intentionally ignored
// Any other child of a run means Word emitted something this test was not designed for,
// so the helper throws rather than guessing. Keeping it this strict stops it from
// drifting toward a reimplementation of UnicodeMapper.RunToString.
private static string ExtractExpectedFromWord(XElement p)
{
    var expected = new System.Text.StringBuilder();

    // Only runs that are direct children of the paragraph are visited, in document order.
    foreach (var runChild in p.Elements(W.r).SelectMany(run => run.Elements()))
    {
        var childName = runChild.Name;

        if (childName == W.lastRenderedPageBreak)
        {
            continue;
        }

        if (childName == W.t)
        {
            expected.Append((string)runChild);
            continue;
        }

        if (childName == W.tab)
        {
            expected.Append('\t');
            continue;
        }

        throw new System.InvalidOperationException(
            $"Unexpected element <{runChild.Name.LocalName}> encountered in expected Word output.");
    }

    return expected.ToString();
}
156276
}
157277
}

OpenXmlPowerTools/DocumentAssembler/DocumentAssembler.cs

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -654,7 +654,7 @@ private class RunReplacementInfo
654654
p.Add(new XElement(W.r,
655655
para.Elements(W.r).Elements(W.rPr).FirstOrDefault(),
656656
(p.Elements().Count() > 1) ? new XElement(W.br) : null,
657-
new XElement(W.t, line)));
657+
new XElement(W.t, GetXmlSpaceAttribute(line), line)));
658658
}
659659
return p;
660660
}
@@ -666,7 +666,7 @@ private class RunReplacementInfo
666666
list.Add(new XElement(W.r,
667667
run.Elements().Where(e => e.Name != W.t),
668668
(list.Count > 0) ? new XElement(W.br) : null,
669-
new XElement(W.t, line)));
669+
new XElement(W.t, GetXmlSpaceAttribute(line), line)));
670670
}
671671
return list;
672672
}
@@ -1415,5 +1415,18 @@ private static string EvaluateXPathToString(XElement element, string xPath, bool
14151415

14161416
return xPathSelectResult.ToString();
14171417
}
1418+
1419+
/// <summary>
/// Decides whether a w:t element holding the given text needs an xml:space="preserve" attribute.
/// </summary>
/// <param name="textOfTextElement">The text that will become the content of the w:t element.</param>
/// <returns>
/// A new xml:space="preserve" attribute when the text starts or ends with a white-space
/// character; otherwise null (also for null or empty text), so that no attribute is added.
/// </returns>
private static XAttribute GetXmlSpaceAttribute(string textOfTextElement)
{
    if (string.IsNullOrEmpty(textOfTextElement))
    {
        return null;
    }

    char firstChar = textOfTextElement[0];
    char lastChar = textOfTextElement[textOfTextElement.Length - 1];
    bool hasEdgeWhitespace = char.IsWhiteSpace(firstChar) || char.IsWhiteSpace(lastChar);

    return hasEdgeWhitespace
        ? new XAttribute(XNamespace.Xml + "space", "preserve")
        : null;
}
14181431
}
14191432
}

OpenXmlPowerTools/UnicodeMapper.cs

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,10 @@ public static string RunToString(XElement element)
6060
// For w:t elements, we obviously want the element's value.
6161
if (element.Name == W.t)
6262
{
63-
return (string)element;
63+
// Emulate Word's handling of the xml:space attribute on text elements
64+
XAttribute? spaceAttribute = element.Attribute(XNamespace.Xml + "space");
65+
string? space = spaceAttribute?.Value;
66+
return NormalizeWhitespace((string) element, space == "preserve");
6467
}
6568

6669
// Turn elements representing special characters into their corresponding
@@ -141,6 +144,50 @@ public static string RunToString(XElement element)
141144
return StartOfHeading.ToString();
142145
}
143146

147+
/// <summary>
/// Emulate the way Word interprets the content of text elements
/// depending on whether the xml:space="preserve" attribute is present.
/// </summary>
/// <param name="text">The entire content of the w:t element.</param>
/// <param name="preserve">
/// True when the w:t element carries xml:space="preserve". Leading and trailing
/// white space is then kept and TAB characters are kept as-is. When false, leading and
/// trailing XML white space is removed and inner TAB characters become spaces.
/// </param>
/// <returns>The corresponding text string Word would display, print, save,
/// and allow to be edited.</returns>
private static string NormalizeWhitespace(string text, bool preserve)
{
    if (string.IsNullOrEmpty(text))
        return string.Empty;

    int start = 0;
    int end = text.Length;
    if (!preserve)
    {
        // Trim only XML white space (space, TAB, CR, LF). A Unicode-aware trim would also
        // strip characters such as U+00A0 (no-break space), which xml:space does not
        // govern and which are real content of the run.
        while (start < end && IsXmlWhitespace(text[start]))
            start++;
        while (end > start && IsXmlWhitespace(text[end - 1]))
            end--;
    }

    if (start == end)
        return string.Empty;

    var sb = new System.Text.StringBuilder(end - start);
    for (int i = start; i < end; i++)
    {
        char c = text[i];
        switch (c)
        {
            case '\r': // CR or CRLF → space
                sb.Append(' ');
                if (i + 1 < end && text[i + 1] == '\n')
                    i++; // skip LF so CRLF becomes one space
                break;
            case '\n': // LF → space
                sb.Append(' ');
                break;
            case '\t': // TAB preserved or converted to space, depending on mode
                sb.Append(preserve ? c : ' ');
                break;
            default: // SPACE or any other character → preserved exactly
                sb.Append(c);
                break;
        }
    }

    return sb.ToString();
}

/// <summary>
/// Returns true for the four characters the XML specification defines as white space.
/// </summary>
private static bool IsXmlWhitespace(char c)
{
    return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}
190+
144191
/// <summary>
145192
/// Translate a symbol into a Unicode character, using the specified w:font attribute
146193
/// value and unicode value (represented by the w:sym element's w:char attribute),

TestFiles/DA240-Whitespace.docx

16.1 KB
Binary file not shown.

TestFiles/DA240-Whitespace.xml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
<?xml version="1.0"?>
2+
<Test240>
3+
<MayOrMayNot>may or may not</MayOrMayNot>
4+
<Join1>/</Join1>
5+
<Join2>, </Join2>
6+
<Join3> and </Join3>
7+
</Test240>
15.9 KB
Binary file not shown.

TestFiles/UM-Whitespace-test.docx

13 KB
Binary file not shown.

0 commit comments

Comments
 (0)