diff --git a/OpenXmlPowerTools.Tests/DocumentAssemblerTests.cs b/OpenXmlPowerTools.Tests/DocumentAssemblerTests.cs index cdbdb3bb..8add91d7 100644 --- a/OpenXmlPowerTools.Tests/DocumentAssemblerTests.cs +++ b/OpenXmlPowerTools.Tests/DocumentAssemblerTests.cs @@ -107,6 +107,7 @@ public class DaTests [InlineData("DA264-InvalidRunLevelRepeat.docx", "DA-Data.xml", true)] [InlineData("DA265-RunLevelRepeatWithWhiteSpaceBefore.docx", "DA-Data.xml", false)] [InlineData("DA266-RunLevelRepeat-NoData.docx", "DA-Data.xml", true)] + [InlineData("DA268-Block-Conditional-In-Table-Cell.docx", "DA268-data.xml", false)] public void DA101(string name, string data, bool err) { var sourceDir = new DirectoryInfo("../../../../TestFiles/"); @@ -156,6 +157,24 @@ public void DA259(string name, string data, bool err) Assert.Equal(4, brCount); } + [Fact] + public void DA240() + { + string name = "DA240-Whitespace.docx"; + DA101(name, "DA240-Whitespace.xml", false); + var assembledDocx = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, name.Replace(".docx", "-processed-by-DocumentAssembler.docx"))); + WmlDocument afterAssembling = new WmlDocument(assembledDocx.FullName); + + // when elements are inserted that begin or end with white space, make sure white space is preserved + string firstParaTextIncorrect = afterAssembling.MainDocumentPart.Element(W.body).Elements(W.p).First().Value; + Assert.Equal("Content may or may not have spaces: he/she; he, she; he and she.", firstParaTextIncorrect); + // warning: XElement.Value returns the string resulting from direct concatenation of all W.t elements. This is fast but ignores + // proper handling of xml:space="preserve" attributes, which Word honors when rendering content. Below we also check + // the result of UnicodeMapper.RunToString, which has been enhanced to take xml:space="preserve" into account. 
+ string firstParaTextCorrect = InnerText(afterAssembling.MainDocumentPart.Element(W.body).Elements(W.p).First()); + Assert.Equal("Content may or may not have spaces: he/she; he, she; he and she.", firstParaTextCorrect); + } + [Theory] [InlineData("DA024-TrackedRevisions.docx", "DA-Data.xml")] public void DA102_Throws(string name, string data) @@ -487,6 +506,15 @@ private static string GetDocumentText(WmlDocument document) private const string WidePngBase64 = "iVBORw0KGgoAAAANSUhEUgAAAZAAAADICAIAAABJdyC1AAACuUlEQVR4nO3UMQ7CQBAEwT3EvxEvXz/BZKalqniCifrs7gAUvGfmnO/TNwBu7H5edxuAfyFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFZAgWkCFYQIZgARmCBWQIFpAhWECGYAEZggVkCBaQIVhAhmABGYIFFzMzu2y8A5u4PQZkIj89BEMEAAAAASUVORK5CYII="; private const string TallPngBase64 = "iVBORw0KGgoAAAANSUhEUgAAAMgAAAGQCAIAAABkkLjnAAAEF0lEQVR4nO3S0QkCURAEwX1i3mLke0lcI3hVAQzz0Wd3B+72npnzPbfv8mT72devP/CfhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkhEVCWCSERUJYJIRFQlgkzu42y8yTXVnDDBu1Y983AAAAAElFTkSuQmCC"; private const string TruncatedGifBase64 = "R0lGODlhyABQAA=="; + + private static string InnerText(XContainer e) + { + return e.Descendants(W.r) + .Where(r => r.Parent.Name != W.del) + .Select(UnicodeMapper.RunToString) + .StringConcatenate(); + } + private static 
readonly List s_ExpectedErrors = new List() { "The 'http://schemas.openxmlformats.org/wordprocessingml/2006/main:evenHBand' attribute is not declared.", diff --git a/OpenXmlPowerTools.Tests/DocumentBuilderTests.cs b/OpenXmlPowerTools.Tests/DocumentBuilderTests.cs index 55b69ca9..13fc63b9 100644 --- a/OpenXmlPowerTools.Tests/DocumentBuilderTests.cs +++ b/OpenXmlPowerTools.Tests/DocumentBuilderTests.cs @@ -472,6 +472,56 @@ public void DB012_NumberingsWithSameAbstractNumbering() Assert.Equal(3, numberingRoot.Elements(W.num).Count()); } + [Fact] + public void DB012a_NumberingWithZeroIdIsValid() + { + // This document has a numbering definition with a zero id (explicitly indicating "no numbering"). + var name = "DB012a-No-Numbering0.docx"; + var sourceDir = new DirectoryInfo("../../../../TestFiles/"); + var sourceDocx = new FileInfo(Path.Combine(sourceDir.FullName, name)); + var sources = new List() + { + new Source(new WmlDocument(sourceDocx.FullName)), + }; + var processedDestDocx = new FileInfo(Path.Combine(TestUtil.TempDir.FullName, + sourceDocx.Name.Replace(".docx", "-processed-by-DocumentBuilder.docx"))); + DocumentBuilder.BuildDocument(sources, processedDestDocx.FullName); + Validate(processedDestDocx); + } + + [Fact] + public void DB012b_NumberingWithZeroIdWorks() + { + var sourceDir = new DirectoryInfo("../../../../TestFiles/"); + var source0 = new FileInfo(Path.Combine(sourceDir.FullName, "DB012a-No-Numbering0.docx")); + var source1 = new FileInfo(Path.Combine(sourceDir.FullName, "DB012a-No-Numbering1.docx")); + var doc1 = new WmlDocument(source0.FullName); + using (var mem = new MemoryStream()) + { + mem.Write(doc1.DocumentByteArray, 0, doc1.DocumentByteArray.Length); + using (var doc = WordprocessingDocument.Open(mem, true)) + { + var xDoc = doc.MainDocumentPart.GetXDocument(); + var frontMatterPara = xDoc.Root.Elements(W.body).Elements(W.p).FirstOrDefault(); + frontMatterPara.ReplaceWith( + new XElement(PtOpenXml.Insert, + new XAttribute("Id", "Front"))); + 
doc.MainDocumentPart.PutXDocument(); + } + doc1.DocumentByteArray = mem.ToArray(); + } + + var sources = new List() + { + new Source(doc1, true), + new Source(new WmlDocument(source1.FullName), "Front"), + }; + var processedDestDocx = + new FileInfo(Path.Combine(TestUtil.TempDir.FullName, "DB012b-NumberingWithZeroIdWorks.docx")); + DocumentBuilder.BuildDocument(sources, processedDestDocx.FullName); + Validate(processedDestDocx); + } + [Fact] public void DB013a_LocalizedStyleIds_Heading() { diff --git a/OpenXmlPowerTools.Tests/UnicodeMapperTests.cs b/OpenXmlPowerTools.Tests/UnicodeMapperTests.cs index 667695d2..0e129975 100644 --- a/OpenXmlPowerTools.Tests/UnicodeMapperTests.cs +++ b/OpenXmlPowerTools.Tests/UnicodeMapperTests.cs @@ -153,5 +153,125 @@ public void IgnoresTemporaryLayoutMarkers() // characters) should exactly match the output of UnicodeMapper: Assert.Equal(p.Value, actual); } + + private const string PreserveSpacingXmlString = +@" + + + + The following space is retained: + + + but this one is not: + + + . Similarly these two lines should have only a space between them: + + + + Line 1! +Line 2! + + + + +"; + + [Fact] + public void HonorsXmlSpace() + { + // This somewhat rudimentary test is superseded by TreatsXmlSpaceLikeWord() below, + // but it has been left in to provide a simple/direct illustration of a couple of + // the specific test cases covered by that more extensive suite. + XDocument partDocument = XDocument.Parse(PreserveSpacingXmlString); + XElement p = partDocument.Descendants(W.p).Last(); + string innerText = p.Descendants(W.r) + .Select(UnicodeMapper.RunToString) + .StringConcatenate(); + Assert.Equal(@"The following space is retained: but this one is not:. Similarly these two lines should have only a space between them: Line 1! Line 2!", innerText); + } + + // Verifies that UnicodeMapper.RunToString interprets whitespace in <w:t> elements + // exactly the way Microsoft Word does, including honoring xml:space="preserve". 
+ // This is essential because RunToString is used by higher‑level features + // (OpenXmlRegex, DocumentAssembler, etc.) that rely on its output to reflect the + // text an end‑user would actually see and edit in Word. + // + // Word accepts a wide range of “valid” DOCX input, but it normalizes that input + // into a canonical form when displaying or saving the document. These tests + // compare RunToString’s output against Word’s canonicalized output to ensure + // that whitespace is treated as semantic content in the same way Word treats it. + [Fact] + public void TreatsXmlSpaceLikeWord() + { + var sourceDir = new System.IO.DirectoryInfo("../../../../TestFiles/"); + // Test document: crafted to include many whitespace patterns that Word accepts as valid input + var testDoc = new System.IO.FileInfo(System.IO.Path.Combine(sourceDir.FullName, "UM-Whitespace-test.docx")); + var testWmlDoc = new WmlDocument(testDoc.FullName); + var testParagraphs = testWmlDoc.MainDocumentPart + .Element(W.body) + .Elements(W.p).ToList(); + // Canonical document: the same test document after being opened and saved by Word, + // representing Word's own normalized interpretation of that whitespace + var expectedDoc = new System.IO.FileInfo(System.IO.Path.Combine(sourceDir.FullName, "UM-Whitespace-Word-saved.docx")); + var expectedWmlDoc = new WmlDocument(expectedDoc.FullName); + var expectedParagraphs = expectedWmlDoc.MainDocumentPart + .Element(W.body) + .Elements(W.p).ToList(); + // Iterate through pairs of paragraphs (test name, test content, expected result) + for (int i = 0; i < testParagraphs.Count - 1; i += 2) + { + var testNameParagraph = testParagraphs[i]; + var testContentParagraph = testParagraphs[i + 1]; + // Get the test name from the first paragraph + var testName = testNameParagraph.Descendants(W.t) + .Select(t => (string)t) + .StringConcatenate(); + // Get the actual result by calling UnicodeMapper.RunToString on the test content runs + var actualResult = 
testContentParagraph.Descendants(W.r) + .Select(UnicodeMapper.RunToString) + .StringConcatenate(); + // Find corresponding expected result paragraph (same index in expected document) + var expectedResult = ExtractExpectedFromWord(expectedParagraphs[i + 1]); + Assert.True( + expectedResult == actualResult, + $"Test '{testName}' failed. Expected: [{expectedResult}] Actual: [{actualResult}]" + ); + } + } + + // Extracts the expected text from Word’s canonicalized output for the whitespace tests. + // This helper intentionally handles *only* the constructs that Word emits in the saved + // version of UM-Whitespace-test.docx: + // • <w:t> → literal text + // • <w:tab/> → '\t' + // • <w:lastRenderedPageBreak/> (intentionally ignored) + // If any other run-level element appears, it means Word has emitted something this test + // was not designed to handle, and the test fails loudly. This prevents the helper + // from drifting toward reimplementing UnicodeMapper.RunToString. + private static string ExtractExpectedFromWord(XElement p) + { + var sb = new System.Text.StringBuilder(); + foreach (var run in p.Elements(W.r)) + { + foreach (var child in run.Elements()) + { + if (child.Name == W.t) + { + sb.Append((string)child); + } + else if (child.Name == W.tab) + { + sb.Append('\t'); + } + else if (child.Name != W.lastRenderedPageBreak) + { + throw new System.InvalidOperationException( + $"Unexpected element <{child.Name.LocalName}> encountered in expected Word output."); + } + } + } + return sb.ToString(); + } } } \ No newline at end of file diff --git a/OpenXmlPowerTools/DocumentAssembler/DocumentAssembler.cs b/OpenXmlPowerTools/DocumentAssembler/DocumentAssembler.cs index a639ce5d..38f8372e 100644 --- a/OpenXmlPowerTools/DocumentAssembler/DocumentAssembler.cs +++ b/OpenXmlPowerTools/DocumentAssembler/DocumentAssembler.cs @@ -654,7 +654,7 @@ private class RunReplacementInfo p.Add(new XElement(W.r, para.Elements(W.r).Elements(W.rPr).FirstOrDefault(), (p.Elements().Count() > 1) ? 
new XElement(W.br) : null, - new XElement(W.t, line))); + new XElement(W.t, GetXmlSpaceAttribute(line), line))); } return p; } @@ -666,7 +666,7 @@ private class RunReplacementInfo list.Add(new XElement(W.r, run.Elements().Where(e => e.Name != W.t), (list.Count > 0) ? new XElement(W.br) : null, - new XElement(W.t, line))); + new XElement(W.t, GetXmlSpaceAttribute(line), line))); } return list; } @@ -873,9 +873,24 @@ private class RunReplacementInfo } return null; } + var transformedNodes = element.Nodes().Select(n => ContentReplacementTransform(n, data, templateError, owningPart)); + if (element.Name == W.tc) + { + // Check if the table cell contains any block-level elements + // Valid block-level elements in a table cell: p (paragraph), tbl (table), sdt (structured document tag), customXml + var nodesList = transformedNodes.ToList(); + var hasBlockLevelContent = nodesList.Any(n => n is XElement xe && + (xe.Name == W.p || xe.Name == W.tbl || xe.Name == W.sdt || xe.Name == W.customXml)); + if (!hasBlockLevelContent) + { + // Table cells must contain at least one block-level element -- add an empty paragraph + nodesList.Add(new XElement(W.p)); + } + transformedNodes = nodesList; + } return new XElement(element.Name, element.Attributes(), - element.Nodes().Select(n => ContentReplacementTransform(n, data, templateError, owningPart))); + transformedNodes); } return node; } @@ -1400,5 +1415,18 @@ private static string EvaluateXPathToString(XElement element, string xPath, bool return xPathSelectResult.ToString(); } + + private static XAttribute GetXmlSpaceAttribute(string textOfTextElement) + { + if (!string.IsNullOrEmpty(textOfTextElement)) + { + if (char.IsWhiteSpace(textOfTextElement[0]) || + char.IsWhiteSpace(textOfTextElement[textOfTextElement.Length - 1])) + { + return new XAttribute(XNamespace.Xml + "space", "preserve"); + } + } + return null; + } } } diff --git a/OpenXmlPowerTools/DocumentBuilder/DocumentBuilder.cs 
b/OpenXmlPowerTools/DocumentBuilder/DocumentBuilder.cs index b2bbd452..1260a5df 100644 --- a/OpenXmlPowerTools/DocumentBuilder/DocumentBuilder.cs +++ b/OpenXmlPowerTools/DocumentBuilder/DocumentBuilder.cs @@ -1,4 +1,4 @@ -#define TestForUnsupportedDocuments +#define TestForUnsupportedDocuments #define MergeStylesWithSameNames using Codeuctivity.OpenXmlPowerTools.Exceptions; @@ -1411,9 +1411,11 @@ private static void TestForUnsupportedDocument(WordprocessingDocument doc, int s .Descendants(W.numPr) .Where(n => { - var zeroId = (int?)n.Attribute(W.id) == 0; + var zeroId = (int?)n.Attribute(W.id) == 0; // nonstandard but handle defensively + var numIdElement = n.Element(W.numId); // standard OpenXML has numId as child element + var hasZeroNumId = numIdElement != null && (int?)numIdElement.Attribute(W.val) == 0; var hasChildInsId = n.Elements(W.ins).Any(); - if (zeroId || hasChildInsId) + if (hasZeroNumId || zeroId || hasChildInsId) { return false; } @@ -2747,6 +2749,12 @@ private static void CopyNumbering(WordprocessingDocument sourceDocument, Wordpro var idElement = numReference.Descendants(W.numId).FirstOrDefault(); if (idElement != null) { + var numId = (int)idElement.Attribute(W.val); + if (numId == 0) // indicates "no numbering" + { + continue; // skip processing + } + if (oldNumbering == null) { oldNumbering = sourceDocument.MainDocumentPart.NumberingDefinitionsPart.GetXDocument(); @@ -2784,111 +2792,107 @@ private static void CopyNumbering(WordprocessingDocument sourceDocument, Wordpro newNumbering.Add(new XElement(W.numbering, NamespaceAttributes)); } } - var numId = (int)idElement.Attribute(W.val); - if (numId != 0) + var element = oldNumbering + .Descendants(W.num) + .FirstOrDefault(p => (int)p.Attribute(W.numId) == numId); + if (element == null) { - var element = oldNumbering - .Descendants(W.num) -.FirstOrDefault(p => (int)p.Attribute(W.numId) == numId); - if (element == null) - { - continue; - } + continue; + } - // Copy abstract numbering element, if 
necessary (use matching NSID) - var abstractNumIdStr = (string)element - .Elements(W.abstractNumId) - .First() + // Copy abstract numbering element, if necessary (use matching NSID) + var abstractNumIdStr = (string)element + .Elements(W.abstractNumId) + .First() + .Attribute(W.val); + if (!int.TryParse(abstractNumIdStr, out var abstractNumId)) + { + throw new DocumentBuilderException("Invalid document - invalid value for abstractNumId"); + } + + var abstractElement = oldNumbering + .Descendants() + .Elements(W.abstractNum) + .First(p => (int)p.Attribute(W.abstractNumId) == abstractNumId); + var nsidElement = abstractElement + .Element(W.nsid); + string? abstractNSID = null; + if (nsidElement != null) + { + abstractNSID = (string)nsidElement .Attribute(W.val); - if (!int.TryParse(abstractNumIdStr, out var abstractNumId)) - { - throw new DocumentBuilderException("Invalid document - invalid value for abstractNumId"); - } + } - var abstractElement = oldNumbering - .Descendants() - .Elements(W.abstractNum) -.First(p => (int)p.Attribute(W.abstractNumId) == abstractNumId); - var nsidElement = abstractElement - .Element(W.nsid); - string? 
abstractNSID = null; - if (nsidElement != null) + var newAbstractElement = newNumbering + .Descendants() + .Elements(W.abstractNum) + .Where(e => e.Annotation() == null) + .FirstOrDefault(p => { - abstractNSID = (string)nsidElement - .Attribute(W.val); - } - - var newAbstractElement = newNumbering - .Descendants() - .Elements(W.abstractNum) - .Where(e => e.Annotation() == null) -.FirstOrDefault(p => + var thisNsidElement = p.Element(W.nsid); + if (thisNsidElement == null) { - var thisNsidElement = p.Element(W.nsid); - if (thisNsidElement == null) - { - return false; - } - - return (string)thisNsidElement.Attribute(W.val) == abstractNSID; - }); - if (newAbstractElement == null) - { - newAbstractElement = new XElement(abstractElement); - newAbstractElement.Attribute(W.abstractNumId).Value = abstractNumber.ToString(); - abstractNumber++; - if (newNumbering.Root.Elements(W.abstractNum).Any()) - { - newNumbering.Root.Elements(W.abstractNum).Last().AddAfterSelf(newAbstractElement); - } - else - { - newNumbering.Root.Add(newAbstractElement); + return false; } - foreach (var pictId in newAbstractElement.Descendants(W.lvlPicBulletId)) - { - var bulletId = (string)pictId.Attribute(W.val); - var numPicBullet = oldNumbering - .Descendants(W.numPicBullet) - .FirstOrDefault(d => (string)d.Attribute(W.numPicBulletId) == bulletId); - var maxNumPicBulletId = new int[] { -1 }.Concat( - newNumbering.Descendants(W.numPicBullet) - .Attributes(W.numPicBulletId) - .Select(a => (int)a)) - .Max() + 1; - var newNumPicBullet = new XElement(numPicBullet); - newNumPicBullet.Attribute(W.numPicBulletId).Value = maxNumPicBulletId.ToString(); - pictId.Attribute(W.val).Value = maxNumPicBulletId.ToString(); - newNumbering.Root.AddFirst(newNumPicBullet); - } - } - var newAbstractId = newAbstractElement.Attribute(W.abstractNumId).Value; - - // Copy numbering element, if necessary (use matching element with no overrides) - XElement newElement; - if (numIdMap.ContainsKey(numId)) + return 
(string)thisNsidElement.Attribute(W.val) == abstractNSID; + }); + if (newAbstractElement == null) + { + newAbstractElement = new XElement(abstractElement); + newAbstractElement.Attribute(W.abstractNumId).Value = abstractNumber.ToString(); + abstractNumber++; + if (newNumbering.Root.Elements(W.abstractNum).Any()) { - newElement = newNumbering - .Descendants() - .Elements(W.num) - .Where(e => e.Annotation() == null) -.First(p => (int)p.Attribute(W.numId) == numIdMap[numId]); + newNumbering.Root.Elements(W.abstractNum).Last().AddAfterSelf(newAbstractElement); } else { - newElement = new XElement(element); - newElement - .Elements(W.abstractNumId) - .First() - .Attribute(W.val).Value = newAbstractId; - newElement.Attribute(W.numId).Value = number.ToString(); - numIdMap.Add(numId, number); - number++; - newNumbering.Root.Add(newElement); + newNumbering.Root.Add(newAbstractElement); } - idElement.Attribute(W.val).Value = newElement.Attribute(W.numId).Value; + + foreach (var pictId in newAbstractElement.Descendants(W.lvlPicBulletId)) + { + var bulletId = (string)pictId.Attribute(W.val); + var numPicBullet = oldNumbering + .Descendants(W.numPicBullet) + .FirstOrDefault(d => (string)d.Attribute(W.numPicBulletId) == bulletId); + var maxNumPicBulletId = new int[] { -1 }.Concat( + newNumbering.Descendants(W.numPicBullet) + .Attributes(W.numPicBulletId) + .Select(a => (int)a)) + .Max() + 1; + var newNumPicBullet = new XElement(numPicBullet); + newNumPicBullet.Attribute(W.numPicBulletId).Value = maxNumPicBulletId.ToString(); + pictId.Attribute(W.val).Value = maxNumPicBulletId.ToString(); + newNumbering.Root.AddFirst(newNumPicBullet); + } + } + var newAbstractId = newAbstractElement.Attribute(W.abstractNumId).Value; + + // Copy numbering element, if necessary (use matching element with no overrides) + XElement newElement; + if (numIdMap.ContainsKey(numId)) + { + newElement = newNumbering + .Descendants() + .Elements(W.num) + .Where(e => e.Annotation() == null) + .First(p => 
(int)p.Attribute(W.numId) == numIdMap[numId]); + } + else + { + newElement = new XElement(element); + newElement + .Elements(W.abstractNumId) + .First() + .Attribute(W.val).Value = newAbstractId; + newElement.Attribute(W.numId).Value = number.ToString(); + numIdMap.Add(numId, number); + number++; + newNumbering.Root.Add(newElement); } + idElement.Attribute(W.val).Value = newElement.Attribute(W.numId).Value; } } if (newNumbering != null) diff --git a/OpenXmlPowerTools/UnicodeMapper.cs b/OpenXmlPowerTools/UnicodeMapper.cs index ffc42716..ee4c46ae 100644 --- a/OpenXmlPowerTools/UnicodeMapper.cs +++ b/OpenXmlPowerTools/UnicodeMapper.cs @@ -60,7 +60,10 @@ public static string RunToString(XElement element) // For w:t elements, we obviously want the element's value. if (element.Name == W.t) { - return (string)element; + // Emulate Word's handling of the xml:space attribute on text elements + XAttribute? spaceAttribute = element.Attribute(XNamespace.Xml + "space"); + string? space = spaceAttribute?.Value; + return NormalizeWhitespace((string) element, space == "preserve"); } // Turn elements representing special characters into their corresponding @@ -141,6 +144,50 @@ public static string RunToString(XElement element) return StartOfHeading.ToString(); } + /// <summary> + /// Emulate the way Word interprets the content of text elements + /// depending on whether the xml:space="preserve" attribute is present. + /// </summary> + /// <param name="text">The entire content of the w:t element.</param> + /// <returns>The corresponding text string Word would display, print, save, + /// and allow to be edited.</returns> + private static string NormalizeWhitespace(string text, bool preserve) + { + if (string.IsNullOrEmpty(text)) + return string.Empty; + // Trim leading & trailing whitespace when NOT preserving + ReadOnlySpan span = preserve + ? 
text.AsSpan() + : text.AsSpan().Trim(); + if (span.Length == 0) + return string.Empty; + var sb = new System.Text.StringBuilder(span.Length); + int i = 0; + while (i < span.Length) + { + char c = span[i]; + switch (c) + { + case '\r': // CR or CRLF → space + sb.Append(' '); + if (i + 1 < span.Length && span[i + 1] == '\n') + i++; // skip LF so CRLF becomes one space + break; + case '\n': // LF → space + sb.Append(' '); + break; + case '\t': // TAB preserved or converted to space, depending on mode + sb.Append(preserve ? c : ' '); + break; + default: // SPACE or any other character → preserved exactly + sb.Append(c); + break; + } + i++; + } + return sb.ToString(); + } + /// /// Translate a symbol into a Unicode character, using the specified w:font attribute /// value and unicode value (represented by the w:sym element's w:char attribute), diff --git a/OpenXmlPowerToolsExamples/MarkupSimplifierApp/MarkupSimplifierApp.csproj b/OpenXmlPowerToolsExamples/MarkupSimplifierApp/MarkupSimplifierApp.csproj index c9e62b42..df5c1ef1 100644 --- a/OpenXmlPowerToolsExamples/MarkupSimplifierApp/MarkupSimplifierApp.csproj +++ b/OpenXmlPowerToolsExamples/MarkupSimplifierApp/MarkupSimplifierApp.csproj @@ -9,7 +9,7 @@ - + diff --git a/TestFiles/DA240-Whitespace.docx b/TestFiles/DA240-Whitespace.docx new file mode 100644 index 00000000..84f2b228 Binary files /dev/null and b/TestFiles/DA240-Whitespace.docx differ diff --git a/TestFiles/DA240-Whitespace.xml b/TestFiles/DA240-Whitespace.xml new file mode 100644 index 00000000..7c21bac1 --- /dev/null +++ b/TestFiles/DA240-Whitespace.xml @@ -0,0 +1,7 @@ + + + may or may not + / + , + and + diff --git a/TestFiles/DA268-Block-Conditional-In-Table-Cell.docx b/TestFiles/DA268-Block-Conditional-In-Table-Cell.docx new file mode 100644 index 00000000..7796dabb Binary files /dev/null and b/TestFiles/DA268-Block-Conditional-In-Table-Cell.docx differ diff --git a/TestFiles/DA268-data.xml b/TestFiles/DA268-data.xml new file mode 100644 index 
00000000..10750a67 --- /dev/null +++ b/TestFiles/DA268-data.xml @@ -0,0 +1,4 @@ + + + False + \ No newline at end of file diff --git a/TestFiles/DB012a-No-Numbering0.docx b/TestFiles/DB012a-No-Numbering0.docx new file mode 100644 index 00000000..f44d133a Binary files /dev/null and b/TestFiles/DB012a-No-Numbering0.docx differ diff --git a/TestFiles/DB012a-No-Numbering1.docx b/TestFiles/DB012a-No-Numbering1.docx new file mode 100644 index 00000000..685041a4 Binary files /dev/null and b/TestFiles/DB012a-No-Numbering1.docx differ diff --git a/TestFiles/UM-Whitespace-Word-saved.docx b/TestFiles/UM-Whitespace-Word-saved.docx new file mode 100644 index 00000000..8e590c02 Binary files /dev/null and b/TestFiles/UM-Whitespace-Word-saved.docx differ diff --git a/TestFiles/UM-Whitespace-test.docx b/TestFiles/UM-Whitespace-test.docx new file mode 100644 index 00000000..c72ae3f1 Binary files /dev/null and b/TestFiles/UM-Whitespace-test.docx differ