Skip to content

Commit fdd4a37

Browse files
committed
Fix crash in Frame.__str__ on partial UTF-8 sequences.
Fix #1695.
1 parent f518d15 commit fdd4a37

3 files changed

Lines changed: 103 additions & 35 deletions

File tree

docs/project/changelog.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@ Improvements
4040

4141
* Added wheels for ARMv7, PowerPC, RISC-V, and S/390.
4242

43+
Bug fixes
44+
.........
45+
46+
* Prevented ``Frame.__str__`` from crashing when a text frame is fragmented in
47+
the middle of a UTF-8 sequence.
48+
4349
.. _16.0:
4450

4551
16.0

src/websockets/frames.py

Lines changed: 84 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -147,55 +147,105 @@ class Frame:
147147
# Configure if you want to see more in logs. Should be a multiple of 3.
148148
MAX_LOG_SIZE = int(os.environ.get("WEBSOCKETS_MAX_LOG_SIZE", "75"))
149149

150+
DEFAULT_IS_TEXT = {OP_TEXT: True, OP_BINARY: False, OP_CLOSE: True}
151+
150152
def __str__(self) -> str:
151153
"""
152154
Return a human-readable representation of a frame.
153155
156+
This function is intended for logging and debugging. It doesn't aim to
157+
support round-tripping because payloads can be too long for displaying
158+
conveniently. Instead, it shows the beginning and the end. It's robust
159+
to incorrect data.
160+
161+
It attempts to decode UTF-8 payloads whenever possible, even for binary
162+
frames and control frames, because those frequently contain UTF-8 data.
163+
It applies the same logic to continuation frames, because we don't know
164+
if they continue a text frame or a binary frame.
165+
154166
"""
155-
coding = None
167+
expect_text = self.DEFAULT_IS_TEXT.get(self.opcode)
168+
data_repr, is_text = self._data_repr()
169+
170+
data_type = "" if expect_text == is_text else ("text" if is_text else "binary")
156171
length = f"{len(self.data)} byte{'' if len(self.data) == 1 else 's'}"
157172
non_final = "" if self.fin else "continued"
173+
metadata = ", ".join(filter(None, [data_type, length, non_final]))
174+
175+
return f"{self.opcode.name} {data_repr} [{metadata}]"
176+
177+
def _data_repr(self) -> tuple[str, bool | None]:
178+
"""
179+
Return a human-readable representation of the payload.
180+
181+
Also returns whether the payload is text.
182+
183+
The representation is elided to fit ``MAX_LOG_SIZE``.
184+
185+
This is a helper for the __str__ method.
158186
159-
if self.opcode is OP_TEXT:
160-
# Decoding only the beginning and the end is needlessly hard.
161-
# Decode the entire payload then elide later if necessary.
162-
data = repr(bytes(self.data).decode())
163-
elif self.opcode is OP_BINARY:
164-
# We'll show at most the first 16 bytes and the last 8 bytes.
165-
# Encode just what we need, plus two dummy bytes to elide later.
187+
"""
188+
if not self.data:
189+
return "''", self.DEFAULT_IS_TEXT.get(self.opcode)
190+
191+
# Special case for close frames: parse close code and reason.
192+
# Fall back to the standard case if the payload is malformed.
193+
194+
if self.opcode is OP_CLOSE:
195+
try:
196+
return str(Close.parse(self.data)), True
197+
except (ProtocolError, UnicodeDecodeError):
198+
pass
199+
200+
# Guess whether the payload is UTF-8 or binary, regardless of opcode, to
201+
# display UTF-8 text in binary frames nicely and generally to be helpful
202+
# and robust. Also support frames fragmented within UTF-8 sequences.
203+
204+
if len(self.data) > 4 * self.MAX_LOG_SIZE:
205+
# Process only the start and the end, as the middle will be elided.
206+
# Cast to bytes because self.data could be a memoryview.
207+
data_start = bytes(self.data[: 8 * self.MAX_LOG_SIZE // 3])
208+
data_end = bytes(self.data[-4 * self.MAX_LOG_SIZE // 3 :])
209+
is_text = is_utf8_fragment(
210+
data_start,
211+
must_start_clean=self.opcode != OP_CONT,
212+
) and is_utf8_fragment(
213+
data_end,
214+
must_end_clean=self.fin,
215+
)
216+
if is_text:
217+
# Cast to bytes before decoding because data could be a memoryview.
218+
data_repr = repr((data_start + data_end).decode(errors="replace"))
219+
220+
else:
221+
# Cast to bytes because self.data could be a memoryview.
222+
data = bytes(self.data)
223+
is_text = is_utf8_fragment(
224+
data,
225+
must_start_clean=self.opcode != OP_CONT,
226+
must_end_clean=self.fin,
227+
)
228+
if is_text:
229+
data_repr = repr(data.decode(errors="replace"))
230+
231+
# When the payload is text (except perhaps for boundaries), we decoded
232+
# enough in ``data_repr``. Now, do the same for binary payloads.
233+
234+
if not is_text:
166235
binary = self.data
167236
if len(binary) > self.MAX_LOG_SIZE // 3:
168237
cut = (self.MAX_LOG_SIZE // 3 - 1) // 3 # by default cut = 8
238+
# Encode two dummy bytes to force eliding and adding an ellipsis.
169239
binary = b"".join([binary[: 2 * cut], b"\x00\x00", binary[-cut:]])
170-
data = " ".join(f"{byte:02x}" for byte in binary)
171-
elif self.opcode is OP_CLOSE:
172-
data = str(Close.parse(self.data))
173-
elif self.data:
174-
# We don't know if a Continuation frame contains text or binary.
175-
# Ping and Pong frames could contain UTF-8.
176-
# Attempt to decode as UTF-8 and display it as text; fallback to
177-
# binary. If self.data is a memoryview, it has no decode() method,
178-
# which raises AttributeError.
179-
try:
180-
data = repr(bytes(self.data).decode())
181-
coding = "text"
182-
except (UnicodeDecodeError, AttributeError):
183-
binary = self.data
184-
if len(binary) > self.MAX_LOG_SIZE // 3:
185-
cut = (self.MAX_LOG_SIZE // 3 - 1) // 3 # by default cut = 8
186-
binary = b"".join([binary[: 2 * cut], b"\x00\x00", binary[-cut:]])
187-
data = " ".join(f"{byte:02x}" for byte in binary)
188-
coding = "binary"
189-
else:
190-
data = "''"
240+
data_repr = " ".join(f"{byte:02x}" for byte in binary)
191241

192-
if len(data) > self.MAX_LOG_SIZE:
193-
cut = self.MAX_LOG_SIZE // 3 - 1 # by default cut = 24
194-
data = data[: 2 * cut] + "..." + data[-cut:]
242+
# Elide the middle of the representation to fit the maximum log size.
195243

196-
metadata = ", ".join(filter(None, [coding, length, non_final]))
244+
if len(data_repr) > self.MAX_LOG_SIZE:
245+
cut = self.MAX_LOG_SIZE // 3 - 1 # by default cut = 24
246+
data_repr = data_repr[: 2 * cut] + "..." + data_repr[-cut:]
197247

198-
return f"{self.opcode.name} {data} [{metadata}]"
248+
return data_repr, is_text
199249

200250
@classmethod
201251
def parse(

tests/test_frames.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def test_extensions(self):
174174
class Rot13:
175175
@staticmethod
176176
def encode(frame):
177-
assert frame.opcode == OP_TEXT
177+
assert frame.opcode is OP_TEXT
178178
text = frame.data.decode()
179179
data = codecs.encode(text, "rot13").encode()
180180
return dataclasses.replace(frame, data=data)
@@ -229,6 +229,12 @@ def test_cont_final_binary_from_memoryview(self):
229229
"CONT fc fd fe ff [binary, 4 bytes]",
230230
)
231231

232+
def test_cont_text_fragmented(self):
233+
self.assertEqual(
234+
str(Frame(OP_CONT, b"\xa9caf\xc3", fin=False)),
235+
"CONT '�caf�' [text, 5 bytes, continued]",
236+
)
237+
232238
def test_cont_text_truncated(self):
233239
self.assertEqual(
234240
str(Frame(OP_CONT, b"caf\xc3\xa9 " * 16, fin=False)),
@@ -262,6 +268,12 @@ def test_text_non_final(self):
262268
"TEXT 'café' [5 bytes, continued]",
263269
)
264270

271+
def test_text_fragmented(self):
272+
self.assertEqual(
273+
str(Frame(OP_TEXT, b"caf\xc3", fin=False)),
274+
"TEXT 'caf�' [4 bytes, continued]",
275+
)
276+
265277
def test_text_truncated(self):
266278
self.assertEqual(
267279
str(Frame(OP_TEXT, b"caf\xc3\xa9 " * 16)),

0 commit comments

Comments
 (0)