From f1f0184ef40cc7ce50090901da61019a8b5522f0 Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Fri, 10 Jan 2025 16:55:04 +0900 Subject: [PATCH 1/2] Remove wrong arguments given to create_context_tar The fourth argument is args, meant to be a dict. --- test/test_util_docker.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_util_docker.py b/test/test_util_docker.py index 18034bbec..7b03da33e 100644 --- a/test/test_util_docker.py +++ b/test/test_util_docker.py @@ -98,7 +98,7 @@ def test_create_context_tar_basic(self): os.chmod(os.path.join(d, "extra"), MODE_STANDARD) tp = os.path.join(tmp, "tar") - h = docker.create_context_tar(tmp, d, tp, "my_image") + h = docker.create_context_tar(tmp, d, tp) self.assertEqual( h, "6c1cc23357625f64f775a08eace7bbc3877dd08d2f3546e0f2e308bac8491865" ) @@ -131,7 +131,7 @@ def test_create_context_topsrcdir_files(self): os.chmod(os.path.join(extra, "file0"), MODE_STANDARD) tp = os.path.join(tmp, "tar") - h = docker.create_context_tar(tmp, d, tp, "test_image") + h = docker.create_context_tar(tmp, d, tp) self.assertEqual( h, "e7f14044b8ec1ba42e251d4b293af212ad08b30ec8ab6613abbdbe73c3c2b61f" ) @@ -158,7 +158,7 @@ def test_create_context_absolute_path(self): fh.write(b"# %include /etc/shadow\n") with self.assertRaisesRegex(Exception, "cannot be absolute"): - docker.create_context_tar(tmp, d, os.path.join(tmp, "tar"), "test") + docker.create_context_tar(tmp, d, os.path.join(tmp, "tar")) finally: shutil.rmtree(tmp) @@ -172,7 +172,7 @@ def test_create_context_outside_topsrcdir(self): fh.write(b"# %include foo/../../../etc/shadow\n") with self.assertRaisesRegex(Exception, "path outside topsrcdir"): - docker.create_context_tar(tmp, d, os.path.join(tmp, "tar"), "test") + docker.create_context_tar(tmp, d, os.path.join(tmp, "tar")) finally: shutil.rmtree(tmp) @@ -186,7 +186,7 @@ def test_create_context_missing_extra(self): fh.write(b"# %include does/not/exist\n") with self.assertRaisesRegex(Exception, "path does not exist"): - docker.create_context_tar(tmp, d, os.path.join(tmp, "tar"), "test") + docker.create_context_tar(tmp, d, os.path.join(tmp, "tar")) finally: shutil.rmtree(tmp) @@ -214,7 +214,7 @@ def test_create_context_extra_directory(self): os.chmod(os.path.join(tmp, "file0"), MODE_STANDARD) tp = os.path.join(tmp, "tar") - h = docker.create_context_tar(tmp, d, tp, "my_image") + h = docker.create_context_tar(tmp, d, tp) self.assertEqual( h, "d2a3363b15d0eb547a6c81a72ddf3980e2f6e6360c29b4fb6818102896f43180" From dffc5e045571a17d64c457a34997991ae80f814d Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Fri, 10 Jan 2025 17:04:30 +0900 Subject: [PATCH 2/2] Hash the uncompressed contents of docker context tar With an increasing number of docker images, the overhead of hashing the compressed contents of the docker context tars becomes visible when loading the toolchain kind during Firefox builds. But the really interesting part for the hash is the uncompressed contents. Whether they are compressed with gzip at level 9, gzip level 1, bz2 or zstd, it doesn't matter for the resulting docker image. So it makes sense to only hash the uncompressed contents, which is much faster. --- src/taskgraph/docker.py | 2 +- src/taskgraph/util/archive.py | 24 +++++++++++++++++++----- src/taskgraph/util/docker.py | 20 ++++++++++---------- test/test_util_docker.py | 12 ++++++------ 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/taskgraph/docker.py b/src/taskgraph/docker.py index fdb2020cb..5f715e327 100644 --- a/src/taskgraph/docker.py +++ b/src/taskgraph/docker.py @@ -122,7 +122,7 @@ def build_image(name, tag, args=None): tag = tag or docker.docker_image(name, by_tag=True) buf = BytesIO() - docker.stream_context_tar(".", image_dir, buf, "", args) + docker.stream_context_tar(".", image_dir, buf, args) cmdargs = ["docker", "image", "build", "--no-cache", "-"] if tag: cmdargs.insert(-1, f"-t={tag}") diff --git a/src/taskgraph/util/archive.py b/src/taskgraph/util/archive.py index b4101ca60..e39119db4 100644 --- a/src/taskgraph/util/archive.py +++ b/src/taskgraph/util/archive.py @@ -7,6 +7,7 @@ import os import stat import tarfile +from contextlib import contextmanager # 2016-01-01T00:00:00+0000 DEFAULT_MTIME = 1451606400 @@ -104,14 +105,14 @@ def create_tar_from_files(fp, files): tf.addfile(ti, f) -def create_tar_gz_from_files(fp, files, filename=None, compresslevel=9): - """Create a tar.gz file deterministically from files. +@contextmanager +def gzip_compressor(fp, filename=None, compresslevel=9): + """Create a deterministic GzipFile writer. - This is a glorified wrapper around ``create_tar_from_files`` that - adds gzip compression. + This is a glorified wrapper around ``GzipFile`` that adds some + determinism. The passed file handle should be opened for writing in binary mode. - When the function returns, all data has been written to the handle. """ # Offset 3-7 in the gzip header contains an mtime. Pin it to a known # value so output is deterministic. @@ -123,4 +124,17 @@ def create_tar_gz_from_files(fp, files, filename=None, compresslevel=9): mtime=DEFAULT_MTIME, ) with gf: + yield gf + + +def create_tar_gz_from_files(fp, files, filename=None, compresslevel=9): + """Create a tar.gz file deterministically from files. + + This is a glorified wrapper around ``create_tar_from_files`` that + adds gzip compression. + + The passed file handle should be opened for writing in binary mode. + When the function returns, all data has been written to the handle. + """ + with gzip_compressor(fp, filename, compresslevel) as gf: create_tar_from_files(gf, files) diff --git a/src/taskgraph/util/docker.py b/src/taskgraph/util/docker.py index e362aece3..88c5d662e 100644 --- a/src/taskgraph/util/docker.py +++ b/src/taskgraph/util/docker.py @@ -10,7 +10,7 @@ import re from typing import Optional -from taskgraph.util.archive import create_tar_gz_from_files +from taskgraph.util.archive import create_tar_from_files, gzip_compressor IMAGE_DIR = os.path.join(".", "taskcluster", "docker") @@ -76,10 +76,15 @@ class HashingWriter: def __init__(self, writer): self._hash = hashlib.sha256() self._writer = writer + self._written = 0 def write(self, buf): self._hash.update(buf) self._writer.write(buf) + self._written += len(buf) + + def tell(self): + return self._written def hexdigest(self): return self._hash.hexdigest() @@ -108,13 +113,8 @@ def create_context_tar(topsrcdir, context_dir, out_path, args=None): Returns the SHA-256 hex digest of the created archive. """ with open(out_path, "wb") as fh: - return stream_context_tar( - topsrcdir, - context_dir, - fh, - image_name=os.path.basename(out_path), - args=args, - ) + with gzip_compressor(fh, filename=os.path.basename(out_path)) as gf: + return stream_context_tar(topsrcdir, context_dir, gf, args=args) RUN_TASK_ROOT = os.path.join(os.path.dirname(os.path.dirname(__file__)), "run-task") @@ -135,7 +135,7 @@ def create_context_tar(topsrcdir, context_dir, out_path, args=None): ] -def stream_context_tar(topsrcdir, context_dir, out_file, image_name=None, args=None): +def stream_context_tar(topsrcdir, context_dir, out_file, args=None): """Like create_context_tar, but streams the tar file to the `out_file` file object.""" archive_files = {} @@ -201,7 +201,7 @@ def stream_context_tar(topsrcdir, context_dir, out_file, image_name=None, args=N archive_files["Dockerfile"] = io.BytesIO("".join(content).encode("utf-8")) writer = HashingWriter(out_file) - create_tar_gz_from_files(writer, archive_files, image_name) + create_tar_from_files(writer, archive_files) return writer.hexdigest() diff --git a/test/test_util_docker.py b/test/test_util_docker.py index 7b03da33e..977f7d624 100644 --- a/test/test_util_docker.py +++ b/test/test_util_docker.py @@ -39,7 +39,7 @@ def test_generate_context_hash(self): docker.generate_context_hash( tmpdir, os.path.join(tmpdir, "docker/my-image"), "my-image" ), - "e1649b3427bd7a0387f4508d25057c2e89228748517aad6c70e3df54f47bd13a", + "ab46d51b191eb6c595cccf1fa02485b4e1decc6ba9737a8b8613038d3661be52", ) finally: shutil.rmtree(tmpdir) @@ -100,7 +100,7 @@ def test_create_context_tar_basic(self): tp = os.path.join(tmp, "tar") h = docker.create_context_tar(tmp, d, tp) self.assertEqual( - h, "6c1cc23357625f64f775a08eace7bbc3877dd08d2f3546e0f2e308bac8491865" + h, "3134fa88c39a604132b260c2c3cf09f6fe4a8234475a4272fd9438aac47caaae" ) # File prefix should be "my_image" @@ -133,7 +133,7 @@ def test_create_context_topsrcdir_files(self): tp = os.path.join(tmp, "tar") h = docker.create_context_tar(tmp, d, tp) self.assertEqual( - h, "e7f14044b8ec1ba42e251d4b293af212ad08b30ec8ab6613abbdbe73c3c2b61f" + h, "56657d2f428fe268cc3b0966649a3bf8477dcd2eade7a1d4accc5c312f075a2f" ) with tarfile.open(tp, "r:gz") as tf: @@ -217,7 +217,7 @@ def test_create_context_extra_directory(self): h = docker.create_context_tar(tmp, d, tp) self.assertEqual( - h, "d2a3363b15d0eb547a6c81a72ddf3980e2f6e6360c29b4fb6818102896f43180" + h, "ba7b62e8f25977e8e6629aee1d121ae92b6007258c90a53fb94c8607e1e96e10" ) with tarfile.open(tp, "r:gz") as tf: @@ -261,11 +261,11 @@ def test_stream_context_tar(self): # file objects are BufferedRandom instances out_file = BufferedRandom(BytesIO(b"")) h = docker.stream_context_tar( - tmp, d, out_file, "my_image", args={"PYTHON_VERSION": "3.8"} + tmp, d, out_file, args={"PYTHON_VERSION": "3.8"} ) self.assertEqual( - h, "e015aabf2677d90fee777c8813fd69402309a2d49bcdff2c28428134a53e36be" + h, "ba7b62e8f25977e8e6629aee1d121ae92b6007258c90a53fb94c8607e1e96e10" ) finally: shutil.rmtree(tmp)