Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion docs/reference/migrations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,23 @@ Migration Guide

This page can help when migrating Taskgraph across major versions.

12.x -> 13.x
------------

* Remove all ``run.cache-dotcache`` keys. If it was set to ``true``, replace it
with:

.. code-block:: yaml

run:
use-caches: [checkout, <caches>]

Where caches can be any of ``cargo``, ``pip``, ``uv`` or ``npm``. If the task
was setting up ``.cache`` for another tool, a mount will need to be created
for it manually. If ``use-caches`` was previously set to ``false``, omit
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Including a block (here here or in a linked doc) about exactly what to add to a task to set up a new cache would be useful (I don't think it's obvious to all readers).

This also makes me wonder if it would be beneficial to allow additional caches to be added in the project-wide or task configuration (so that the mounts would be set up automatically). This is firmly in the category of "future enhancement", though.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think that makes sense. It should be possible by monkeypatching the CACHES dict, but we might want some kind of registry / decorator thing like we have for other things.

``checkout`` in the example above. If ``use-caches`` was previously set to
``true``, replace ``true`` with the value above (including ``checkout``).

11.x -> 12.x
------------

Expand Down Expand Up @@ -110,7 +127,7 @@ This page can help when migrating Taskgraph across major versions.
5.x -> 6.x
----------

* Replace all uses of ``command-context` with the more generalized ``task-context``
* Replace all uses of ``command-context`` with the more generalized ``task-context``

4.x -> 5.x
----------
Expand Down
10 changes: 7 additions & 3 deletions packages/pytest-taskgraph/src/pytest_taskgraph/fixtures/gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,11 @@ def inner(
if extra_params:
parameters.update(extra_params)
if extra_graph_config:
graph_config._config.update(extra_graph_config)
# We need this intermediate variable because `GraphConfig` is
# frozen and we can't set attributes on it.
new_graph_config = merge(graph_config._config, extra_graph_config)
graph_config._config.update(new_graph_config)

return TransformConfig(
"test",
str(here),
Expand All @@ -220,12 +224,12 @@ def inner(

@pytest.fixture
def run_transform(make_transform_config):
def inner(func, tasks, config=None):
def inner(func, tasks, config=None, **kwargs):
if not isinstance(tasks, list):
tasks = [tasks]

if not config:
config = make_transform_config()
config = make_transform_config(**kwargs)
return list(func(config, tasks))

return inner
Expand Down
15 changes: 13 additions & 2 deletions src/taskgraph/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from voluptuous import All, Any, Extra, Length, Optional, Required

from .util import path
from .util.caches import CACHES
from .util.python_path import find_object
from .util.schema import Schema, optionally_keyed_by, validate_schema
from .util.yaml import load_yaml
Expand Down Expand Up @@ -74,6 +75,16 @@
"index-path-regexes",
description="Regular expressions matching index paths to be summarized.",
): [str],
Optional(
"run",
description="Configuration related to the 'run' transforms.",
): {
Optional(
"use-caches",
description="List of caches to enable, or a boolean to "
"enable/disable all of them.",
): Any(bool, list(CACHES.keys())),
},
Required("repositories"): All(
{
str: {
Expand Down Expand Up @@ -106,8 +117,8 @@ def __getitem__(self, name):
def __contains__(self, name):
return name in self._config

def get(self, name):
return self._config.get(name)
def get(self, name, default=None):
return self._config.get(name, default)

def register(self):
"""
Expand Down
109 changes: 72 additions & 37 deletions src/taskgraph/transforms/run/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@
consistency.
"""

import hashlib
import json
from typing import Any, Dict, List, Union

from taskgraph.transforms.base import TransformConfig
from taskgraph.util import path
from taskgraph.util.caches import CACHES, get_checkout_dir
from taskgraph.util.taskcluster import get_artifact_prefix


Expand All @@ -32,10 +34,10 @@ def add_cache(task, taskdesc, name, mount_point, skip_untrusted=False):
skip_untrusted (bool): Whether cache is used in untrusted environments
(default: False). Only applies to docker-worker.
"""
if not task["run"].get("use-caches", True):
return

worker = task["worker"]
if worker["implementation"] not in ("docker-worker", "generic-worker"):
# caches support not implemented
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems a bit odd to me that add_cache silently does nothing in certain cases. This is clearly existing behaviour though, and I imagine it would break a lot of things if it changed, so it's certainly not a blocker here.

return

if worker["implementation"] == "docker-worker":
taskdesc["worker"].setdefault("caches", []).append(
Expand All @@ -55,10 +57,6 @@ def add_cache(task, taskdesc, name, mount_point, skip_untrusted=False):
}
)

else:
# Caches not implemented
pass


def add_artifacts(config, task, taskdesc, path):
taskdesc["worker"].setdefault("artifacts", []).append(
Expand Down Expand Up @@ -92,46 +90,19 @@ def support_vcs_checkout(config, task, taskdesc, repo_configs, sparse=False):
reserved for ``run-task`` tasks.
"""
worker = task["worker"]
is_mac = worker["os"] == "macosx"
assert worker["os"] in ("linux", "macosx", "windows")
is_win = worker["os"] == "windows"
is_linux = worker["os"] == "linux"
is_docker = worker["implementation"] == "docker-worker"
assert is_mac or is_win or is_linux

checkoutdir = get_checkout_dir(task)
if is_win:
checkoutdir = "build"
hgstore = "y:/hg-shared"
elif is_docker:
checkoutdir = "{workdir}/checkouts".format(**task["run"])
hgstore = f"{checkoutdir}/hg-store"
else:
checkoutdir = "checkouts"
hgstore = f"{checkoutdir}/hg-shared"

vcsdir = f"{checkoutdir}/{get_vcsdir_name(worker['os'])}"
cache_name = "checkouts"

# Robust checkout does not clean up subrepositories, so ensure that tasks
# that checkout different sets of paths have separate caches.
# See https://bugzilla.mozilla.org/show_bug.cgi?id=1631610
if len(repo_configs) > 1:
checkout_paths = {
"\t".join([repo_config.path, repo_config.prefix])
for repo_config in sorted(
repo_configs.values(), key=lambda repo_config: repo_config.path
)
}
checkout_paths_str = "\n".join(checkout_paths).encode("utf-8")
digest = hashlib.sha256(checkout_paths_str).hexdigest()
cache_name += f"-repos-{digest}"

# Sparse checkouts need their own cache because they can interfere
# with clients that aren't sparse aware.
if sparse:
cache_name += "-sparse"

add_cache(task, taskdesc, cache_name, checkoutdir)

env = taskdesc["worker"].setdefault("env", {})
env.update(
{
Expand Down Expand Up @@ -166,3 +137,67 @@ def support_vcs_checkout(config, task, taskdesc, repo_configs, sparse=False):
taskdesc["worker"]["taskcluster-proxy"] = True

return vcsdir


def should_use_cache(
name: str,
use_caches: Union[bool, List[str]],
has_checkout: bool,
) -> bool:
# Never enable the checkout cache if there's no clone. This allows
# 'checkout' to be specified as a default cache without impacting
# irrelevant tasks.
if name == "checkout" and not has_checkout:
return False

if isinstance(use_caches, bool):
return use_caches

return name in use_caches


def support_caches(
config: TransformConfig, task: Dict[str, Any], taskdesc: Dict[str, Any]
):
"""Add caches for common tools."""
run = task["run"]
worker = task["worker"]
workdir = run.get("workdir")
base_cache_dir = ".task-cache"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's probably fine to put these caches outside of their default place, and obviously it worked fine with all your testing with Gecko. I anticipate that some edge case tasks or issues will come up though (eg: scripts or other things that are trying to either pull things from a cache, or inject them, and not paying attention to the env vars when doing so). That's not something that ought to stop this work, but it may cause unexpected bustage when this is picked up in various places.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call out!

The reason I couldn't use .cache is that when setting e.g VOLUME /builds/worker/.cache/pip in the the Dockerfile, image-builder actually creates that directory with root. The run-task script then chown's all VOLUMEs to the worker user, but it doesn't handle parent directories, so ~/.cache/pip ends up owned by worker, but ~/.cache ends up owned by root. This prevented Gecko from writing to ~/.cache and broke some tests in a subtle way ><. We could fix run-task to chown the whole path to a VOLUME, but I thought it would be cleanest to just use an entirely separate directory.

Another issue is that the default place is different for every platform. So we would need logic in Taskgraph that detects which platform the task is running and tries to set the default place accordingly per tool. This is further complicated by the fact that under Windows the default cache dir is under %LOCALAPPDATA%, which I'm not sure generic-worker would be capable of resolving? We could probably implement this, but being explicit with the envs seemed simpler.

if worker["implementation"] == "docker-worker":
workdir = workdir or "/builds/worker"
base_cache_dir = path.join(workdir, base_cache_dir)

use_caches = run.get("use-caches")
if use_caches is None:
# Use project default values for filtering caches, default to
# checkout cache if no selection is specified.
use_caches = (
config.graph_config.get("taskgraph", {})
.get("run", {})
.get("use-caches", ["checkout"])
)

for name, cache_cfg in CACHES.items():
if not should_use_cache(name, use_caches, run["checkout"]):
continue

if "cache_dir" in cache_cfg:
assert callable(cache_cfg["cache_dir"])
cache_dir = cache_cfg["cache_dir"](task)
else:
cache_dir = f"{base_cache_dir}/{name}"

if "cache_name" in cache_cfg:
assert callable(cache_cfg["cache_name"])
cache_name = cache_cfg["cache_name"](config, task)
else:
cache_name = name

if cache_cfg.get("env"):
env = taskdesc["worker"].setdefault("env", {})
# If cache_dir is already absolute, the `.join` call returns it as
# is. In that case, {task_workdir} will get interpolated by
# run-task.
env[cache_cfg["env"]] = path.join("{task_workdir}", cache_dir)
add_cache(task, taskdesc, cache_name, cache_dir)
36 changes: 11 additions & 25 deletions src/taskgraph/transforms/run/run_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,13 @@
from voluptuous import Any, Optional, Required

from taskgraph.transforms.run import run_task_using
from taskgraph.transforms.run.common import support_vcs_checkout
from taskgraph.transforms.run.common import (
support_caches,
support_vcs_checkout,
)
from taskgraph.transforms.task import taskref_or_string
from taskgraph.util import path, taskcluster
from taskgraph.util.caches import CACHES
from taskgraph.util.schema import Schema

EXEC_COMMANDS = {
Expand All @@ -24,12 +28,11 @@
run_task_schema = Schema(
{
Required("using"): "run-task",
# if true, add a cache at ~worker/.cache, which is where things like pip
# tend to hide their caches. This cache is never added for level-1 tasks.
# TODO Once bug 1526028 is fixed, this and 'use-caches' should be merged.
Required("cache-dotcache"): bool,
# Whether or not to use caches.
Optional("use-caches"): bool,
# Which caches to use. May take a boolean in which case either all
# (True) or no (False) caches will be used. Alternatively, it can
# accept a list of caches to enable. Defaults to only the checkout cache
# enabled.
Optional("use-caches", "caches"): Any(bool, list(CACHES.keys())),
# if true (the default), perform a checkout on the worker
Required("checkout"): Any(bool, {str: dict}),
Optional(
Expand Down Expand Up @@ -103,11 +106,11 @@ def common_setup(config, task, taskdesc, command):
if "cwd" in run:
command.extend(("--task-cwd", run["cwd"]))

support_caches(config, task, taskdesc)
taskdesc["worker"].setdefault("env", {})["MOZ_SCM_LEVEL"] = config.params["level"]


worker_defaults = {
"cache-dotcache": False,
"checkout": True,
"sparse-profile": None,
"run-as-root": False,
Expand All @@ -134,16 +137,6 @@ def docker_worker_run_task(config, task, taskdesc):
command = run.pop("run-task-command", ["/usr/local/bin/run-task"])
common_setup(config, task, taskdesc, command)

if run.get("cache-dotcache"):
worker["caches"].append(
{
"type": "persistent",
"name": "{project}-dotcache".format(**config.params),
"mount-point": "{workdir}/.cache".format(**run),
"skip-untrusted": True,
}
)

run_command = run["command"]

# dict is for the case of `{'task-reference': str}`.
Expand Down Expand Up @@ -176,13 +169,6 @@ def generic_worker_run_task(config, task, taskdesc):
common_setup(config, task, taskdesc, command)

worker.setdefault("mounts", [])
if run.get("cache-dotcache"):
worker["mounts"].append(
{
"cache-name": "{project}-dotcache".format(**config.params),
"directory": "{workdir}/.cache".format(**run),
}
)
worker["mounts"].append(
{
"content": {
Expand Down
57 changes: 57 additions & 0 deletions src/taskgraph/util/caches.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import hashlib
from typing import TYPE_CHECKING, Any, Dict

if TYPE_CHECKING:
from taskgraph.transforms.base import TransformConfig


def get_checkout_dir(task: Dict[str, Any]) -> str:
worker = task["worker"]
if worker["os"] == "windows":
return "build"
elif worker["implementation"] == "docker-worker":
return f"{task['run']['workdir']}/checkouts"
else:
return "checkouts"


def get_checkout_cache_name(config: "TransformConfig", task: Dict[str, Any]) -> str:
repo_configs = config.repo_configs
cache_name = "checkouts"

# Robust checkout does not clean up subrepositories, so ensure that tasks
# that checkout different sets of paths have separate caches.
# See https://bugzilla.mozilla.org/show_bug.cgi?id=1631610
if len(repo_configs) > 1:
checkout_paths = {
"\t".join([repo_config.path, repo_config.prefix])
for repo_config in sorted(
repo_configs.values(), key=lambda repo_config: repo_config.path
)
}
checkout_paths_str = "\n".join(checkout_paths).encode("utf-8")
digest = hashlib.sha256(checkout_paths_str).hexdigest()
cache_name += f"-repos-{digest}"

# Sparse checkouts need their own cache because they can interfere
# with clients that aren't sparse aware.
if task["run"]["sparse-profile"]:
cache_name += "-sparse"

return cache_name


CACHES = {
"cargo": {"env": "CARGO_HOME"},
"checkout": {
"cache_dir": get_checkout_dir,
"cache_name": get_checkout_cache_name,
},
"npm": {"env": "npm_config_cache"},
"pip": {"env": "PIP_CACHE_DIR"},
"uv": {"env": "UV_CACHE_DIR"},
}
2 changes: 1 addition & 1 deletion taskcluster/docker/python/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ FROM debian:bookworm-slim
LABEL maintainer="Release Engineering <release@mozilla.com>"

VOLUME /builds/worker/checkouts
VOLUME /builds/worker/.cache
VOLUME /builds/worker/.task-cache/uv

# Add worker user
RUN mkdir -p /builds && \
Expand Down
Loading
Loading