@@ -153,5 +153,125 @@ public void IgnoresTemporaryLayoutMarkers()
153153 // characters) should exactly match the output of UnicodeMapper:
154154 Assert . Equal ( p . Value , actual ) ;
155155 }
156+
// Minimal WordprocessingML document parsed by HonorsXmlSpace(): one paragraph holding
// four runs. The first and third <w:t> elements carry xml:space="preserve"; the second
// and fourth do not. The second run's text ends in a trailing space, and the fourth
// run's text ("Line 1!" / "Line 2!") is spread across separate lines, so together they
// cover both the "preserve" and the default whitespace handling of <w:t>.
157+ private const string PreserveSpacingXmlString =
158+ @"<w:document xmlns:w=""http://schemas.openxmlformats.org/wordprocessingml/2006/main"">
159+ <w:body>
160+ <w:p>
161+ <w:r>
162+ <w:t xml:space=""preserve"">The following space is retained: </w:t>
163+ </w:r>
164+ <w:r>
165+ <w:t>but this one is not: </w:t>
166+ </w:r>
167+ <w:r>
168+ <w:t xml:space=""preserve"">. Similarly these two lines should have only a space between them: </w:t>
169+ </w:r>
170+ <w:r>
171+ <w:t>
172+ Line 1!
173+ Line 2!
174+ </w:t>
175+ </w:r>
176+ </w:p>
177+ </w:body>
178+ </w:document>" ;
179+
[Fact]
public void HonorsXmlSpace()
{
    // This rudimentary test is superseded by TreatsXmlSpaceLikeWord(). It is kept as a
    // small, direct illustration of a couple of the cases that the larger suite covers:
    // a trailing space that is kept under xml:space="preserve", one that is dropped
    // without it, and multi-line text that collapses to a single separating space.
    const string expectedText = @"The following space is retained: but this one is not:. Similarly these two lines should have only a space between them: Line 1! Line 2!";

    var document = XDocument.Parse(PreserveSpacingXmlString);
    var lastParagraph = document.Descendants(W.p).Last();

    // Convert every run in the paragraph to text and join the pieces in document order.
    var runs = lastParagraph.Descendants(W.r);
    var actualText = runs.Select(UnicodeMapper.RunToString).StringConcatenate();

    Assert.Equal(expectedText, actualText);
}
193+
// Verifies that UnicodeMapper.RunToString interprets whitespace in <w:t> elements
// exactly the way Microsoft Word does, including honoring xml:space="preserve".
// This is essential because RunToString is used by higher-level features
// (OpenXmlRegex, DocumentAssembler, etc.) that rely on its output to reflect the
// text an end user would actually see and edit in Word.
//
// Word accepts a wide range of "valid" DOCX input, but it normalizes that input
// into a canonical form when displaying or saving the document. This test compares
// RunToString's output for each case in the hand-crafted test document against the
// text of the same paragraph in a copy of that document that was opened and saved
// by Word, to ensure whitespace is treated as semantic content the way Word treats it.
//
// Layout of the test document: body paragraphs come in pairs, a test-name paragraph
// followed by a test-content paragraph. The expected text for a pair is read from the
// paragraph at the same index as the content paragraph in the Word-saved document.
[Fact]
public void TreatsXmlSpaceLikeWord()
{
    // Relative path, so it is resolved against the process's current directory.
    var sourceDir = new System.IO.DirectoryInfo("../../../../TestFiles/");

    // Test document: crafted to include many whitespace patterns that Word accepts as valid input.
    var testDoc = new System.IO.FileInfo(System.IO.Path.Combine(sourceDir.FullName, "UM-Whitespace-test.docx"));
    var testWmlDoc = new WmlDocument(testDoc.FullName);
    var testParagraphs = testWmlDoc.MainDocumentPart
        .Element(W.body)
        .Elements(W.p).ToList();

    // Canonical document: the same test document after being opened and saved by Word,
    // representing Word's own normalized interpretation of that whitespace.
    var expectedDoc = new System.IO.FileInfo(System.IO.Path.Combine(sourceDir.FullName, "UM-Whitespace-Word-saved.docx"));
    var expectedWmlDoc = new WmlDocument(expectedDoc.FullName);
    var expectedParagraphs = expectedWmlDoc.MainDocumentPart
        .Element(W.body)
        .Elements(W.p).ToList();

    // Without at least one (name, content) pair the loop below would run zero times and
    // the test would pass without having checked anything.
    Assert.True(
        testParagraphs.Count >= 2,
        $"Test document '{testDoc.Name}' contains no (test name, test content) paragraph pairs.");

    // Walk the test document two paragraphs at a time: [i] = test name, [i + 1] = test content.
    for (int i = 0; i < testParagraphs.Count - 1; i += 2)
    {
        var testNameParagraph = testParagraphs[i];
        var testContentParagraph = testParagraphs[i + 1];

        // Get the test name from the first paragraph of the pair.
        var testName = testNameParagraph.Descendants(W.t)
            .Select(t => (string)t)
            .StringConcatenate();

        // Get the actual result by calling UnicodeMapper.RunToString on the test content runs.
        var actualResult = testContentParagraph.Descendants(W.r)
            .Select(UnicodeMapper.RunToString)
            .StringConcatenate();

        // The expected paragraph is looked up by index, so report a missing counterpart
        // with the test name instead of failing with a bare ArgumentOutOfRangeException.
        Assert.True(
            i + 1 < expectedParagraphs.Count,
            $"Test '{testName}' has no corresponding paragraph (index {i + 1}) in '{expectedDoc.Name}'.");

        // Find the corresponding expected result paragraph (same index in the expected document).
        var expectedResult = ExtractExpectedFromWord(expectedParagraphs[i + 1]);

        // Assert.True with a custom message (rather than Assert.Equal) so that a failure
        // names the test case taken from the document.
        Assert.True(
            expectedResult == actualResult,
            $"Test '{testName}' failed. Expected: [{expectedResult}] Actual: [{actualResult}]"
        );
    }
}
242+
// Extracts the expected text from Word's canonicalized output for the whitespace tests.
// This helper intentionally handles *only* the constructs that Word emits in the saved
// version of UM-Whitespace-test.docx (UM-Whitespace-Word-saved.docx):
//   - <w:t>                       -> literal text
//   - <w:tab/>                    -> '\t'
//   - <w:lastRenderedPageBreak/>  -> (intentionally ignored)
// If any other run-level element appears, it means Word has emitted something this test
// was not designed to handle, and the test fails loudly. This prevents the helper
// from drifting toward reimplementing UnicodeMapper.RunToString.
//
// Parameter p: a <w:p> element from the Word-saved document.
// Returns: the concatenated text of the paragraph's runs.
// Throws InvalidOperationException when a run contains any child element other than the
// three listed above.
//
// Note: only runs that are direct children of the paragraph are visited; runs nested
// inside other paragraph-level containers are not read by this helper.
private static string ExtractExpectedFromWord(XElement p)
{
    var sb = new System.Text.StringBuilder();
    foreach (var run in p.Elements(W.r))
    {
        foreach (var child in run.Elements())
        {
            if (child.Name == W.t)
            {
                // The element's string value is taken verbatim: in the Word-saved
                // document it is treated as already canonical.
                sb.Append((string)child);
            }
            else if (child.Name == W.tab)
            {
                sb.Append('\t');
            }
            else if (child.Name != W.lastRenderedPageBreak)
            {
                // Anything else is outside this helper's deliberately narrow contract.
                throw new System.InvalidOperationException(
                    $"Unexpected element <{child.Name.LocalName}> encountered in expected Word output.");
            }
        }
    }
    return sb.ToString();
}
156276 }
157277}
0 commit comments