Bug 1937801 - Implement mechanism to use caches for common tools in run transforms #623
@@ -7,10 +7,12 @@
 consistency.
 """

 import hashlib
 import json
 from typing import Any, Dict, List, Union

 from taskgraph.transforms.base import TransformConfig
 from taskgraph.util import path
 from taskgraph.util.caches import CACHES, get_checkout_dir
 from taskgraph.util.taskcluster import get_artifact_prefix
@@ -32,10 +34,10 @@ def add_cache(task, taskdesc, name, mount_point, skip_untrusted=False):
         skip_untrusted (bool): Whether cache is used in untrusted environments
             (default: False). Only applies to docker-worker.
     """
     if not task["run"].get("use-caches", True):
         return

     worker = task["worker"]
     if worker["implementation"] not in ("docker-worker", "generic-worker"):
         # caches support not implemented
Contributor: It seems a bit odd to me that
         return

     if worker["implementation"] == "docker-worker":
         taskdesc["worker"].setdefault("caches", []).append(

@@ -55,10 +57,6 @@ def add_cache(task, taskdesc, name, mount_point, skip_untrusted=False):
             }
         )

     else:
         # Caches not implemented
         pass


 def add_artifacts(config, task, taskdesc, path):
     taskdesc["worker"].setdefault("artifacts", []).append(
@@ -92,46 +90,19 @@ def support_vcs_checkout(config, task, taskdesc, repo_configs, sparse=False):
         reserved for ``run-task`` tasks.
     """
     worker = task["worker"]
     is_mac = worker["os"] == "macosx"
     assert worker["os"] in ("linux", "macosx", "windows")
     is_win = worker["os"] == "windows"
     is_linux = worker["os"] == "linux"
     is_docker = worker["implementation"] == "docker-worker"
     assert is_mac or is_win or is_linux

     checkoutdir = get_checkout_dir(task)
     if is_win:
         checkoutdir = "build"
         hgstore = "y:/hg-shared"
     elif is_docker:
         checkoutdir = "{workdir}/checkouts".format(**task["run"])
         hgstore = f"{checkoutdir}/hg-store"
     else:
         checkoutdir = "checkouts"
         hgstore = f"{checkoutdir}/hg-shared"

     vcsdir = f"{checkoutdir}/{get_vcsdir_name(worker['os'])}"
     cache_name = "checkouts"

     # Robust checkout does not clean up subrepositories, so ensure that tasks
     # that checkout different sets of paths have separate caches.
     # See https://bugzilla.mozilla.org/show_bug.cgi?id=1631610
     if len(repo_configs) > 1:
         checkout_paths = {
             "\t".join([repo_config.path, repo_config.prefix])
             for repo_config in sorted(
                 repo_configs.values(), key=lambda repo_config: repo_config.path
             )
         }
         checkout_paths_str = "\n".join(checkout_paths).encode("utf-8")
         digest = hashlib.sha256(checkout_paths_str).hexdigest()
         cache_name += f"-repos-{digest}"

     # Sparse checkouts need their own cache because they can interfere
     # with clients that aren't sparse aware.
     if sparse:
         cache_name += "-sparse"

     add_cache(task, taskdesc, cache_name, checkoutdir)

     env = taskdesc["worker"].setdefault("env", {})
     env.update(
         {
@@ -166,3 +137,67 @@ def support_vcs_checkout(config, task, taskdesc, repo_configs, sparse=False):
         taskdesc["worker"]["taskcluster-proxy"] = True

     return vcsdir


 def should_use_cache(
     name: str,
     use_caches: Union[bool, List[str]],
     has_checkout: bool,
 ) -> bool:
     # Never enable the checkout cache if there's no clone. This allows
     # 'checkout' to be specified as a default cache without impacting
     # irrelevant tasks.
     if name == "checkout" and not has_checkout:
         return False

     if isinstance(use_caches, bool):
         return use_caches

     return name in use_caches


 def support_caches(
     config: TransformConfig, task: Dict[str, Any], taskdesc: Dict[str, Any]
 ):
     """Add caches for common tools."""
     run = task["run"]
     worker = task["worker"]
     workdir = run.get("workdir")
     base_cache_dir = ".task-cache"
Contributor: I think it's probably fine to put these caches outside of their default place, and obviously it worked fine with all your testing with Gecko. I anticipate that some edge case tasks or issues will come up though (e.g. scripts or other things that are trying to either pull things from a cache, or inject them, and not paying attention to the env vars when doing so). That's not something that ought to stop this work, but it may cause unexpected bustage when this is picked up in various places.

Collaborator (Author): Good call out! The reason I couldn't use
Another issue is that the default place is different for every platform. So we would need logic in Taskgraph that detects which platform the task is running on and tries to set the default place accordingly per tool. This is further complicated by the fact that under Windows the default cache dir is under
     if worker["implementation"] == "docker-worker":
         workdir = workdir or "/builds/worker"
         base_cache_dir = path.join(workdir, base_cache_dir)

     use_caches = run.get("use-caches")
     if use_caches is None:
         # Use project default values for filtering caches, default to
         # checkout cache if no selection is specified.
         use_caches = (
             config.graph_config.get("taskgraph", {})
             .get("run", {})
             .get("use-caches", ["checkout"])
         )

     for name, cache_cfg in CACHES.items():
         if not should_use_cache(name, use_caches, run["checkout"]):
             continue

         if "cache_dir" in cache_cfg:
             assert callable(cache_cfg["cache_dir"])
             cache_dir = cache_cfg["cache_dir"](task)
         else:
             cache_dir = f"{base_cache_dir}/{name}"

         if "cache_name" in cache_cfg:
             assert callable(cache_cfg["cache_name"])
             cache_name = cache_cfg["cache_name"](config, task)
         else:
             cache_name = name

         if cache_cfg.get("env"):
             env = taskdesc["worker"].setdefault("env", {})
             # If cache_dir is already absolute, the `.join` call returns it as
             # is. In that case, {task_workdir} will get interpolated by
             # run-task.
             env[cache_cfg["env"]] = path.join("{task_workdir}", cache_dir)

         add_cache(task, taskdesc, cache_name, cache_dir)
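For readers skimming the hunk above, the following sketch spells out how the new `use-caches` selection behaves. It assumes `should_use_cache` is imported from the module modified above; the cache names come from the `CACHES` mapping introduced in the new file below.

```python
# Sketch of should_use_cache() semantics (import assumed, not shown in the diff).

# A boolean enables or disables every cache wholesale:
assert should_use_cache("pip", use_caches=True, has_checkout=False)
assert not should_use_cache("pip", use_caches=False, has_checkout=True)

# A list selects caches by name:
assert should_use_cache("pip", use_caches=["checkout", "pip"], has_checkout=True)
assert not should_use_cache("cargo", use_caches=["checkout", "pip"], has_checkout=True)

# The checkout cache is only added when the task actually clones something,
# so "checkout" can safely stay in the project-wide default selection:
assert not should_use_cache("checkout", use_caches=True, has_checkout=False)
assert should_use_cache("checkout", use_caches=["checkout"], has_checkout=True)
```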
The second file in the diff is new and adds the shared cache definitions:

@@ -0,0 +1,57 @@
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

 import hashlib
 from typing import TYPE_CHECKING, Any, Dict

 if TYPE_CHECKING:
     from taskgraph.transforms.base import TransformConfig


 def get_checkout_dir(task: Dict[str, Any]) -> str:
     worker = task["worker"]
     if worker["os"] == "windows":
         return "build"
     elif worker["implementation"] == "docker-worker":
         return f"{task['run']['workdir']}/checkouts"
     else:
         return "checkouts"


 def get_checkout_cache_name(config: "TransformConfig", task: Dict[str, Any]) -> str:
     repo_configs = config.repo_configs
     cache_name = "checkouts"

     # Robust checkout does not clean up subrepositories, so ensure that tasks
     # that checkout different sets of paths have separate caches.
     # See https://bugzilla.mozilla.org/show_bug.cgi?id=1631610
     if len(repo_configs) > 1:
         checkout_paths = {
             "\t".join([repo_config.path, repo_config.prefix])
             for repo_config in sorted(
                 repo_configs.values(), key=lambda repo_config: repo_config.path
             )
         }
         checkout_paths_str = "\n".join(checkout_paths).encode("utf-8")
         digest = hashlib.sha256(checkout_paths_str).hexdigest()
         cache_name += f"-repos-{digest}"

     # Sparse checkouts need their own cache because they can interfere
     # with clients that aren't sparse aware.
     if task["run"]["sparse-profile"]:
         cache_name += "-sparse"

     return cache_name


 CACHES = {
     "cargo": {"env": "CARGO_HOME"},
     "checkout": {
         "cache_dir": get_checkout_dir,
         "cache_name": get_checkout_cache_name,
     },
     "npm": {"env": "npm_config_cache"},
     "pip": {"env": "PIP_CACHE_DIR"},
     "uv": {"env": "UV_CACHE_DIR"},
 }
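To make the effect of `support_caches` more concrete, here is a small illustration of the value it puts in the environment for the pip cache. The concrete paths are assumptions derived from the defaults in the hunks above (`/builds/worker` on docker-worker, a relative `.task-cache` directory otherwise), not output copied from the PR.

```python
from taskgraph.util import path

# docker-worker: base_cache_dir is absolute, so per the comment in
# support_caches the join returns it unchanged and PIP_CACHE_DIR points
# directly at the mounted cache.
print(path.join("{task_workdir}", "/builds/worker/.task-cache/pip"))
# -> /builds/worker/.task-cache/pip

# generic-worker: the cache dir stays relative, so the {task_workdir}
# placeholder is kept and later interpolated by run-task.
print(path.join("{task_workdir}", ".task-cache/pip"))
# -> {task_workdir}/.task-cache/pip
```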
Review comment: Including a block (here or in a linked doc) about exactly what to add to a task to set up a new cache would be useful (I don't think it's obvious to all readers).

This also makes me wonder if it would be beneficial to allow additional caches to be added in the project-wide or task configuration (so that the mounts would be set up automatically). This is firmly in the category of "future enhancement", though.
Reply: Yeah, I think that makes sense. It should be possible by monkeypatching the CACHES dict, but we might want some kind of registry / decorator thing like we have for other things.