diff --git a/AUTHORS b/AUTHORS index b72d0136484..c12d321a9e2 100644 --- a/AUTHORS +++ b/AUTHORS @@ -428,6 +428,7 @@ Samuele Pedroni Sanket Duthade Sankt Petersbug Saravanan Padmanaban +Sean Doherty Sean Malloy Segev Finer Serhii Mozghovyi diff --git a/changelog/14483.bugfix.rst b/changelog/14483.bugfix.rst new file mode 100644 index 00000000000..f8c9b96c1f4 --- /dev/null +++ b/changelog/14483.bugfix.rst @@ -0,0 +1 @@ +Preserved valid supplementary-plane Unicode characters, such as emoji, in JUnit XML output instead of visually escaping them as invalid XML. diff --git a/src/_pytest/junitxml.py b/src/_pytest/junitxml.py index ae8d2b94d36..30c93c88c07 100644 --- a/src/_pytest/junitxml.py +++ b/src/_pytest/junitxml.py @@ -55,9 +55,7 @@ def repl(matchobj: re.Match[str]) -> str: # The spec range of valid chars is: # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] # For an unknown(?) reason, we disallow #x7F (DEL) as well. - illegal_xml_re = ( - "[^\u0009\u000a\u000d\u0020-\u007e\u0080-\ud7ff\ue000-\ufffd\u10000-\u10ffff]" - ) + illegal_xml_re = "[^\u0009\u000a\u000d\u0020-\u007e\u0080-\ud7ff\ue000-\ufffd\U00010000-\U0010ffff]" return re.sub(illegal_xml_re, repl, str(arg)) diff --git a/testing/test_junitxml.py b/testing/test_junitxml.py index 5a603c05bc8..ce3e731d45b 100644 --- a/testing/test_junitxml.py +++ b/testing/test_junitxml.py @@ -1104,11 +1104,6 @@ def test_invalid_xml_escape() -> None: # Test some more invalid xml chars, the full range should be # tested really but let's just test the edges of the ranges # instead. - # XXX This only tests low unicode character points for now as - # there are some issues with the testing infrastructure for - # the higher ones. - # XXX Testing 0xD (\r) is tricky as it overwrites the just written - # line in the output, so we skip it too. invalid = ( 0x00, 0x1, @@ -1122,8 +1117,18 @@ def test_invalid_xml_escape() -> None: 0xFFFE, 0x0FFFF, ) # , 0x110000) - valid = (0x9, 0xA, 0x20) - # 0xD, 0xD7FF, 0xE000, 0xFFFD, 0x10000, 0x10FFFF) + valid = ( + 0x9, + 0xA, + 0xD, + 0x20, + 0xD7FF, + 0xE000, + 0xFFFD, + 0x10000, + 0x1F600, + 0x10FFFF, + ) for i in invalid: got = bin_xml_escape(chr(i))