From 75ff3b7536f5f75af98d346802865cb4a33c3eaf Mon Sep 17 00:00:00 2001
From: Bastien Orivel
Date: Mon, 2 Jun 2025 11:56:43 +0200
Subject: [PATCH] Don't decode YAML responses before parsing them in get_artifact

Taskcluster sends those without a content-type which means that requests
assumes they're latin1 text, which they might not be as taskgraph uploads
its YAMLs encoded as UTF-8.

While decoding UTF-8 as latin1 works most of the time, if the commit
message passed in a parameters.yml contains a special character, then it
will return a garbled mess which the yaml library will reject.

By passing the raw bytes instead of decoding them, we sidestep the
problem entirely, delegating the decoding to the YAML library which
supports both UTF-8 and UTF-16.

From the documentation for `yaml.load`

> A byte string or a file must be encoded with utf-8, utf-16-be or
> utf-16-le encoding. yaml.load detects the encoding by checking the BOM
> (byte order mark) sequence at the beginning of the string/file. If no
> BOM is present, the utf-8 encoding is assumed.
--- src/taskgraph/util/taskcluster.py | 2 +- test/test_util_taskcluster.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/taskgraph/util/taskcluster.py b/src/taskgraph/util/taskcluster.py index fdc9ce414..cebc4e5da 100644 --- a/src/taskgraph/util/taskcluster.py +++ b/src/taskgraph/util/taskcluster.py @@ -143,7 +143,7 @@ def _handle_artifact(path, response): if path.endswith(".json"): return response.json() if path.endswith(".yml"): - return yaml.load_stream(response.text) + return yaml.load_stream(response.content) response.raw.read = functools.partial(response.raw.read, decode_content=True) return response.raw diff --git a/test/test_util_taskcluster.py b/test/test_util_taskcluster.py index f79b2a428..39f604a78 100644 --- a/test/test_util_taskcluster.py +++ b/test/test_util_taskcluster.py @@ -154,6 +154,21 @@ def test_get_artifact(responses, root_url): ) assert tc.get_artifact(tid, "artifact.yml") == {"foo": "bar"} + responses.add( + responses.GET, + f"{root_url}/api/queue/v1/task/{tid}/artifacts/artifact.yml", + body=b"foo: \xe2\x81\x83", + ) + assert tc.get_artifact(tid, "artifact.yml") == {"foo": b"\xe2\x81\x83".decode()} + + responses.add( + responses.GET, + f"{root_url}/api/queue/v1/task/{tid}/artifacts/artifact.yml", + body=b"foo: \xe2\x81\x83".decode().encode("utf-16"), + headers={"Content-Type": "text/yaml; charset=utf-16"}, + ) + assert tc.get_artifact(tid, "artifact.yml") == {"foo": b"\xe2\x81\x83".decode()} + def test_list_artifact(responses, root_url): tid = 123