From faf6ba4baaeac9a7874af6b9592170776aebf2dc Mon Sep 17 00:00:00 2001 From: Andrew Halberstadt Date: Wed, 8 Oct 2025 13:51:09 -0400 Subject: [PATCH 1/2] refactor!: rename base_rev -> base in vcs.get_changed_files() This function works with anything committish, update the argument name to reflect that (while we're breaking backwards compat anyway). --- src/taskgraph/util/vcs.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/taskgraph/util/vcs.py b/src/taskgraph/util/vcs.py index 80f08e4da..e2d887771 100644 --- a/src/taskgraph/util/vcs.py +++ b/src/taskgraph/util/vcs.py @@ -140,12 +140,12 @@ def get_changed_files( diff_filter: Optional[str], mode: Optional[str], rev: Optional[str], - base_rev: Optional[str], + base: Optional[str], ) -> List[str]: """Return a list of files that are changed in: * either this repository's working copy, * or at a given revision (``rev``) - * or between 2 revisions (``base_rev`` and ``rev``) + * or between 2 revisions (``base`` and ``rev``) ``diff_filter`` controls which kinds of modifications are returned. It is a string which may only contain the following characters: @@ -162,9 +162,9 @@ def get_changed_files( ``rev`` is a specifier for which changesets to consider for changes. The exact meaning depends on the vcs system being used. - ``base_rev`` specifies the range of changesets. This parameter cannot + ``base`` specifies the range of changesets. This parameter cannot be used without ``rev``. The range includes ``rev`` but excludes - ``base_rev``. + ``base``. """ @abstractmethod @@ -293,17 +293,17 @@ def get_tracked_files(self, *paths, rev=None): rev = rev or "." return self.run("files", "-r", rev, *paths).splitlines() - def get_changed_files(self, diff_filter=None, mode=None, rev=None, base_rev=None): + def get_changed_files(self, diff_filter=None, mode=None, rev=None, base=None): diff_filter = diff_filter or "ADM" if rev is None: - if base_rev is not None: - raise ValueError("Cannot specify `base_rev` without `rev`") + if base is not None: + raise ValueError("Cannot specify `base` without `rev`") # Use --no-status to print just the filename. df = self._format_diff_filter(diff_filter, for_status=True) return self.run("status", "--no-status", f"-{df}").splitlines() else: template = self._files_template(diff_filter) - revision_argument = rev if base_rev is None else f"{rev} % {base_rev}" + revision_argument = rev if base is None else f"{rev} % {base}" return self.run("log", "-r", revision_argument, "-T", template).splitlines() def get_outgoing_files(self, diff_filter="ADM", upstream=None): @@ -479,23 +479,21 @@ def get_tracked_files(self, *paths, rev=None): rev = rev or "HEAD" return self.run("ls-tree", "-r", "--name-only", rev, *paths).splitlines() - def get_changed_files(self, diff_filter=None, mode=None, rev=None, base_rev=None): + def get_changed_files(self, diff_filter=None, mode=None, rev=None, base=None): diff_filter = diff_filter or "ADM" mode = mode or "unstaged" assert all(f.lower() in self._valid_diff_filter for f in diff_filter) if rev is None: - if base_rev is not None: - raise ValueError("Cannot specify `base_rev` without `rev`") + if base is not None: + raise ValueError("Cannot specify `base` without `rev`") cmd = ["diff"] if mode == "staged": cmd.append("--cached") elif mode == "all": cmd.append("HEAD") else: - revision_argument = ( - f"{rev}~1..{rev}" if base_rev is None else f"{base_rev}..{rev}" - ) + revision_argument = f"{rev}~1..{rev}" if base is None else f"{base}..{rev}" cmd = ["log", "--format=format:", revision_argument] cmd.append("--name-only") From 90a856b0bbd7b33313872a413eee6535fdc94b26 Mon Sep 17 00:00:00 2001 From: Andrew Halberstadt Date: Wed, 8 Oct 2025 13:51:09 -0400 Subject: [PATCH 2/2] fix!: remove post processing of base_ref and base_rev parameters BREAKING CHANGE: the base_ref and base_rev parameters will now match what was passed into the Decision task exactly. This means `base_ref` will not get reset to the repo's default branch, and `base_rev` will no longer be reset to the merge base. ### Why we were previously post-processing these parameters Github push and pull_request events almost never contain the base ref, and often don't contain the base rev. For base ref, it only ever seems to be present when pushing a new branch. For base rev it is missing in the following cases: 1. All pull_request events. The value we pass in is actually the revision that the base ref happens to be pointing at when the event is fired. 2. Force pushes. The value we pass in is actually the old unreachable head rev that used to be on the branch Really, we only ever get a proper base rev when doing normal fast-forward pushes. The reason we need a base rev in the first place, is to compute the files changed. So let's say there's a PR with the following graph: A -> B -> C -> D (main) \ E -> F (PR) Based on the Github event, `D` is being passed in as `base_rev` and `F` is being passed in as `head_rev`. The post-processing was essentially running `git merge-base` to find `B`, so that we could then use `git log` to find all files touched between `B..F`. ### Why the post-processing isn't actually necessary It turns out that `git log D..F` already does the right thing! It means, show me all commits reachable from `F`, but not reachable from `D`, which is exactly what we want. Only files changed in `E` and `F` will be included. So there's no benefit of using `merge-base` for the purposes of finding changed files. As far as the `base_ref` post-processing goes, all we were doing was setting it to `repo.default_branch` if it doesn't exist (which is almost always for pushes). While this likely works in most cases, we don't actually have any idea what the real `base_ref` is, so it's not correct to be doing this. Since we don't even need `base_ref` to determine files changed, I think it's fine to leave it as `None` in the event it wasn't passed in. ### Risks with this patch While removing this should be fine for the purposes of determining changed files, there might be other use cases that projects are using the `merge-base` for. Such projects may need to determine the merge-base on their own, therefore this patch is backwards incompatible. ### Future improvements I think we should consider removing or renaming the `base_ref` and `base_rev` parameters. As outlined, they are misleading (at least with Github based repos) and don't actually represent the "base revision". But for now, I'll save that particular change for another day. --- src/taskgraph/decision.py | 81 +-------------------------------- src/taskgraph/parameters.py | 2 +- test/test_decision.py | 91 +------------------------------------ 3 files changed, 5 insertions(+), 169 deletions(-) diff --git a/src/taskgraph/decision.py b/src/taskgraph/decision.py index e75bc3014..a0c5381a8 100644 --- a/src/taskgraph/decision.py +++ b/src/taskgraph/decision.py @@ -21,7 +21,7 @@ from taskgraph.util import json from taskgraph.util.python_path import find_object from taskgraph.util.schema import Schema, validate_schema -from taskgraph.util.vcs import Repository, get_repository +from taskgraph.util.vcs import get_repository from taskgraph.util.yaml import load_yaml logger = logging.getLogger(__name__) @@ -192,25 +192,10 @@ def get_decision_parameters(graph_config, options): except UnicodeDecodeError: commit_message = "" - parameters["base_ref"] = _determine_more_accurate_base_ref( - repo, - candidate_base_ref=options.get("base_ref"), - head_ref=options.get("head_ref"), - base_rev=options.get("base_rev"), - ) - - parameters["base_rev"] = _determine_more_accurate_base_rev( - repo, - base_ref=parameters["base_ref"], - candidate_base_rev=options.get("base_rev"), - head_rev=options.get("head_rev"), - env_prefix=_get_env_prefix(graph_config), - ) - # Define default filter list, as most configurations shouldn't need # custom filters. parameters["files_changed"] = repo.get_changed_files( - rev=parameters["head_rev"], base_rev=parameters["base_rev"] + rev=parameters["head_rev"], base=parameters["base_rev"] ) parameters["filters"] = [ "target_tasks_method", @@ -284,68 +269,6 @@ def get_decision_parameters(graph_config, options): return result -def _determine_more_accurate_base_ref(repo, candidate_base_ref, head_ref, base_rev): - base_ref = candidate_base_ref - - if not candidate_base_ref: - base_ref = repo.default_branch - elif candidate_base_ref == head_ref and base_rev == Repository.NULL_REVISION: - logger.info( - "base_ref and head_ref are identical but base_rev equals the null revision. " - "This is a new branch but Github didn't identify its actual base." - ) - base_ref = repo.default_branch - - if base_ref != candidate_base_ref: - logger.info( - f'base_ref has been reset from "{candidate_base_ref}" to "{base_ref}".' - ) - - return base_ref - - -def _determine_more_accurate_base_rev( - repo, base_ref, candidate_base_rev, head_rev, env_prefix -): - if not candidate_base_rev: - logger.info("base_rev is not set.") - base_ref_or_rev = base_ref - elif candidate_base_rev == Repository.NULL_REVISION: - logger.info("base_rev equals the null revision. This branch is a new one.") - base_ref_or_rev = base_ref - elif not repo.does_revision_exist_locally(candidate_base_rev): - logger.warning( - "base_rev does not exist locally. It is likely because the branch was force-pushed. " - "taskgraph is not able to assess how many commits were changed and assumes it is only " - f"the last one. Please set the {env_prefix.upper()}_BASE_REV environment variable " - "in the decision task and provide `--base-rev` to taskgraph." - ) - base_ref_or_rev = base_ref - else: - base_ref_or_rev = candidate_base_rev - - if base_ref_or_rev == base_ref: - logger.info( - f'Using base_ref "{base_ref}" to determine latest common revision...' - ) - - base_rev = repo.find_latest_common_revision(base_ref_or_rev, head_rev) - if base_rev != candidate_base_rev: - if base_ref_or_rev == candidate_base_rev: - logger.info("base_rev is not an ancestor of head_rev.") - - logger.info( - f'base_rev has been reset from "{candidate_base_rev}" to "{base_rev}".' - ) - - return base_rev - - -def _get_env_prefix(graph_config): - repo_keys = list(graph_config["taskgraph"].get("repositories", {}).keys()) - return repo_keys[0] if repo_keys else "" - - def set_try_config(parameters, task_config_file): if os.path.isfile(task_config_file): logger.info(f"using try tasks from {task_config_file}") diff --git a/src/taskgraph/parameters.py b/src/taskgraph/parameters.py index d88a155b7..12f470f15 100644 --- a/src/taskgraph/parameters.py +++ b/src/taskgraph/parameters.py @@ -33,7 +33,7 @@ class ParameterMismatch(Exception): base_schema = Schema( { Required("base_repository"): str, - Required("base_ref"): str, + Optional("base_ref"): str, Required("base_rev"): str, Required("build_date"): int, Required("build_number"): int, diff --git a/test/test_decision.py b/test/test_decision.py index 2b2e4f34e..f3071a938 100644 --- a/test/test_decision.py +++ b/test/test_decision.py @@ -10,8 +10,6 @@ import unittest from pathlib import Path -import pytest - from taskgraph import decision from taskgraph.util.vcs import GitRepository, HgRepository from taskgraph.util.yaml import load_yaml @@ -54,6 +52,7 @@ class TestGetDecisionParameters(unittest.TestCase): def setUp(self): self.options = { "base_repository": "https://hg.mozilla.org/mozilla-unified", + "base_rev": "aaaa", "head_repository": "https://hg.mozilla.org/mozilla-central", "head_rev": "bbbb", "head_ref": "default", @@ -67,27 +66,11 @@ def setUp(self): "tasks_for": "hg-push", "level": "3", } - self.old_determine_more_accurate_base_rev = ( - decision._determine_more_accurate_base_rev - ) - decision._determine_more_accurate_base_rev = lambda *_, **__: "aaaa" - self.old_determine_more_accurate_base_ref = ( - decision._determine_more_accurate_base_ref - ) - decision._determine_more_accurate_base_ref = lambda *_, **__: "aaaa" - - def tearDown(self): - decision._determine_more_accurate_base_rev = ( - self.old_determine_more_accurate_base_rev - ) - decision._determine_more_accurate_base_ref = ( - self.old_determine_more_accurate_base_ref - ) def test_simple_options(self, mock_files_changed): mock_files_changed.return_value = ["foo.txt"] params = decision.get_decision_parameters(FAKE_GRAPH_CONFIG, self.options) - mock_files_changed.assert_called_once_with(rev="bbbb", base_rev="aaaa") + mock_files_changed.assert_called_once_with(rev="bbbb", base="aaaa") self.assertEqual(params["build_date"], 1503691511) self.assertEqual(params["head_tag"], "v0.0.1") self.assertEqual(params["pushlog_id"], "143") @@ -154,73 +137,3 @@ def test_dontbuild_commit_message_yields_default_target_tasks_method( self.options["tasks_for"] = "hg-push" params = decision.get_decision_parameters(FAKE_GRAPH_CONFIG, self.options) self.assertEqual(params["target_tasks_method"], "nothing") - - -@pytest.mark.parametrize( - "candidate_base_ref, base_rev, expected_base_ref", - ( - ("", "base-rev", "default-branch"), - ("head-ref", "base-rev", "head-ref"), - ("head-ref", "0000000000000000000000000000000000000000", "default-branch"), - ), -) -def test_determine_more_accurate_base_ref( - candidate_base_ref, base_rev, expected_base_ref -): - repo_mock = unittest.mock.MagicMock() - repo_mock.default_branch = "default-branch" - - assert ( - decision._determine_more_accurate_base_ref( - repo_mock, candidate_base_ref, "head-ref", base_rev - ) - == expected_base_ref - ) - - -@pytest.mark.parametrize( - "common_rev, candidate_base_rev, expected_base_ref_or_rev, expected_base_rev", - ( - ("found-rev", "", "base-ref", "found-rev"), - ( - "found-rev", - "0000000000000000000000000000000000000000", - "base-ref", - "found-rev", - ), - ("found-rev", "non-existing-rev", "base-ref", "found-rev"), - ("existing-rev", "existing-rev", "existing-rev", "existing-rev"), - ), -) -def test_determine_more_accurate_base_rev( - common_rev, candidate_base_rev, expected_base_ref_or_rev, expected_base_rev -): - repo_mock = unittest.mock.MagicMock() - repo_mock.find_latest_common_revision.return_value = common_rev - repo_mock.does_revision_exist_locally = lambda rev: rev == "existing-rev" - - assert ( - decision._determine_more_accurate_base_rev( - repo_mock, "base-ref", candidate_base_rev, "head-rev", env_prefix="PREFIX" - ) - == expected_base_rev - ) - repo_mock.find_latest_common_revision.assert_called_once_with( - expected_base_ref_or_rev, "head-rev" - ) - - -@pytest.mark.parametrize( - "graph_config, expected_value", - ( - ({"taskgraph": {}}, ""), - ({"taskgraph": {"repositories": {}}}, ""), - ({"taskgraph": {"repositories": {"mobile": {}}}}, "mobile"), - ( - {"taskgraph": {"repositories": {"mobile": {}, "some-other-repo": {}}}}, - "mobile", - ), - ), -) -def test_get_env_prefix(graph_config, expected_value): - assert decision._get_env_prefix(graph_config) == expected_value