Fix crash in Frame.__str__ on partial UTF-8 sequences.

aaugustin · aaugustin · commit fdd4a37d2ff1 · 2026-05-17T09:23:09.000+02:00
Fix #1695.
diff --git a/docs/project/changelog.rst b/docs/project/changelog.rst
@@ -40,6 +40,12 @@ Improvements
 
 * Added wheels for ARMv7, PowerPC, RISC-V, and S/390.
 
+Bug fixes
+.........
+
+* Prevented ``Frame.__str__`` from crashing when a text frame is fragmented in
+  the middle of a UTF-8 sequence.
+
 .. _16.0:
 
 16.0
diff --git a/src/websockets/frames.py b/src/websockets/frames.py
@@ -147,55 +147,105 @@ class Frame:
     # Configure if you want to see more in logs. Should be a multiple of 3.
     MAX_LOG_SIZE = int(os.environ.get("WEBSOCKETS_MAX_LOG_SIZE", "75"))
 
+    DEFAULT_IS_TEXT = {OP_TEXT: True, OP_BINARY: False, OP_CLOSE: True}
+
     def __str__(self) -> str:
         """
         Return a human-readable representation of a frame.
 
+        This function is intended for logging and debugging. It doesn't aim to
+        support round-tripping because payloads can be too long for displaying
+        conveniently. Instead, it shows the beginning and the end. It's robust
+        to incorrect data.
+
+        It attempts to decode UTF-8 payloads whenever possible, even for binary
+        frames and control frames, because those frequently contain UTF-8 data.
+        It applies the same logic to continuation frames, because we don't know
+        if they continue a text frame or a binary frame.
+
         """
-        coding = None
+        expect_text = self.DEFAULT_IS_TEXT.get(self.opcode)
+        data_repr, is_text = self._data_repr()
+
+        data_type = "" if expect_text == is_text else ("text" if is_text else "binary")
         length = f"{len(self.data)} byte{'' if len(self.data) == 1 else 's'}"
         non_final = "" if self.fin else "continued"
+        metadata = ", ".join(filter(None, [data_type, length, non_final]))
+
+        return f"{self.opcode.name} {data_repr} [{metadata}]"
+
+    def _data_repr(self) -> tuple[str, bool | None]:
+        """
+        Return a human-readable representation of the payload.
+
+        Also returns whether the payload is text.
+
+        The representation is elided to fit ``MAX_LOG_SIZE``.
+
+        This is a helper for the __str__ method.
 
-        if self.opcode is OP_TEXT:
-            # Decoding only the beginning and the end is needlessly hard.
-            # Decode the entire payload then elide later if necessary.
-            data = repr(bytes(self.data).decode())
-        elif self.opcode is OP_BINARY:
-            # We'll show at most the first 16 bytes and the last 8 bytes.
-            # Encode just what we need, plus two dummy bytes to elide later.
+        """
+        if not self.data:
+            return "''", self.DEFAULT_IS_TEXT.get(self.opcode)
+
+        # Special case for close frames: parse close code and reason.
+        # Fall back to the standard case if the payload is malformed.
+
+        if self.opcode is OP_CLOSE:
+            try:
+                return str(Close.parse(self.data)), True
+            except (ProtocolError, UnicodeDecodeError):
+                pass
+
+        # Guess whether the payload is UTF-8 or binary, regardless of opcode, to
+        # display UTF-8 text in binary frames nicely and generally to be helpful
+        # and robust. Also support frames fragmented within UTF-8 sequences.
+
+        if len(self.data) > 4 * self.MAX_LOG_SIZE:
+            # Process only the start and the end, as the middle will be elided.
+            # Cast to bytes because self.data could be a memoryview.
+            data_start = bytes(self.data[: 8 * self.MAX_LOG_SIZE // 3])
+            data_end = bytes(self.data[-4 * self.MAX_LOG_SIZE // 3 :])
+            is_text = is_utf8_fragment(
+                data_start,
+                must_start_clean=self.opcode != OP_CONT,
+            ) and is_utf8_fragment(
+                data_end,
+                must_end_clean=self.fin,
+            )
+            if is_text:
+                # Cast to bytes before decoding because data could be a memoryview.
+                data_repr = repr((data_start + data_end).decode(errors="replace"))
+
+        else:
+            # Cast to bytes because self.data could be a memoryview.
+            data = bytes(self.data)
+            is_text = is_utf8_fragment(
+                data,
+                must_start_clean=self.opcode != OP_CONT,
+                must_end_clean=self.fin,
+            )
+            if is_text:
+                data_repr = repr(data.decode(errors="replace"))
+
+        # When the payload is text (except perhaps for boundaries), we decoded
+        # enough in ``data_repr``. Now, do the same for binary payloads.
+
+        if not is_text:
             binary = self.data
             if len(binary) > self.MAX_LOG_SIZE // 3:
                 cut = (self.MAX_LOG_SIZE // 3 - 1) // 3  # by default cut = 8
+                # Encode two dummy bytes to force eliding and adding an ellipsis.
                 binary = b"".join([binary[: 2 * cut], b"\x00\x00", binary[-cut:]])
-            data = " ".join(f"{byte:02x}" for byte in binary)
-        elif self.opcode is OP_CLOSE:
-            data = str(Close.parse(self.data))
-        elif self.data:
-            # We don't know if a Continuation frame contains text or binary.
-            # Ping and Pong frames could contain UTF-8.
-            # Attempt to decode as UTF-8 and display it as text; fallback to
-            # binary. If self.data is a memoryview, it has no decode() method,
-            # which raises AttributeError.
-            try:
-                data = repr(bytes(self.data).decode())
-                coding = "text"
-            except (UnicodeDecodeError, AttributeError):
-                binary = self.data
-                if len(binary) > self.MAX_LOG_SIZE // 3:
-                    cut = (self.MAX_LOG_SIZE // 3 - 1) // 3  # by default cut = 8
-                    binary = b"".join([binary[: 2 * cut], b"\x00\x00", binary[-cut:]])
-                data = " ".join(f"{byte:02x}" for byte in binary)
-                coding = "binary"
-        else:
-            data = "''"
+            data_repr = " ".join(f"{byte:02x}" for byte in binary)
 
-        if len(data) > self.MAX_LOG_SIZE:
-            cut = self.MAX_LOG_SIZE // 3 - 1  # by default cut = 24
-            data = data[: 2 * cut] + "..." + data[-cut:]
+        # Elide the middle of the representation to fit the maximum log size.
 
-        metadata = ", ".join(filter(None, [coding, length, non_final]))
+        if len(data_repr) > self.MAX_LOG_SIZE:
+            cut = self.MAX_LOG_SIZE // 3 - 1  # by default cut = 24
+            data_repr = data_repr[: 2 * cut] + "..." + data_repr[-cut:]
 
-        return f"{self.opcode.name} {data} [{metadata}]"
+        return data_repr, is_text
 
     @classmethod
     def parse(
diff --git a/tests/test_frames.py b/tests/test_frames.py
@@ -174,7 +174,7 @@ def test_extensions(self):
         class Rot13:
             @staticmethod
             def encode(frame):
-                assert frame.opcode == OP_TEXT
+                assert frame.opcode is OP_TEXT
                 text = frame.data.decode()
                 data = codecs.encode(text, "rot13").encode()
                 return dataclasses.replace(frame, data=data)
@@ -229,6 +229,12 @@ def test_cont_final_binary_from_memoryview(self):
             "CONT fc fd fe ff [binary, 4 bytes]",
         )
 
+    def test_cont_text_fragmented(self):
+        self.assertEqual(
+            str(Frame(OP_CONT, b"\xa9caf\xc3", fin=False)),
+            "CONT '�caf�' [text, 5 bytes, continued]",
+        )
+
     def test_cont_text_truncated(self):
         self.assertEqual(
             str(Frame(OP_CONT, b"caf\xc3\xa9 " * 16, fin=False)),
@@ -262,6 +268,12 @@ def test_text_non_final(self):
             "TEXT 'café' [5 bytes, continued]",
         )
 
+    def test_text_fragmented(self):
+        self.assertEqual(
+            str(Frame(OP_TEXT, b"caf\xc3", fin=False)),
+            "TEXT 'caf�' [4 bytes, continued]",
+        )
+
     def test_text_truncated(self):
         self.assertEqual(
             str(Frame(OP_TEXT, b"caf\xc3\xa9 " * 16)),