From 98f8464b288e1a431c204801401bf688302f8486 Mon Sep 17 00:00:00 2001 From: Bastien Orivel Date: Fri, 17 Oct 2025 14:54:24 +0200 Subject: [PATCH 1/2] Force `Graph` to actually use frozensets Because type annotations in python are a "I wish people would respect my authority" kind of feature, we cannot guarantee that a `Graph` is actually created with frozensets as attributes. That is the case all throughout taskgraph where we're passing sets. By forcing those attributes to be frozensets we can ensure that `Graph` is hashable which will allow us to use `functools.lru_cache` to cache some expensive functions in here. Unfortunately, there's no easy way of transforming the inputs passed into a dataclass, since in both `__init__` and `__post_init__`, the structure is already frozen. By using a dataclass as the parent of `Graph` instead of `Graph` itself, we can work around the issue while keeping the frozenness of the dataclass post init without having to resort to working around dataclass when transforming the attributes --- src/taskgraph/graph.py | 13 +++++++++---- src/taskgraph/morph.py | 2 +- src/taskgraph/optimize/base.py | 4 +++- src/taskgraph/taskgraph.py | 2 +- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/taskgraph/graph.py b/src/taskgraph/graph.py index c0c089035..8c143d448 100644 --- a/src/taskgraph/graph.py +++ b/src/taskgraph/graph.py @@ -8,7 +8,12 @@ @dataclass(frozen=True) -class Graph: +class _Graph: + nodes: frozenset + edges: frozenset + + +class Graph(_Graph): """Generic representation of a directed acyclic graph with labeled edges connecting the nodes. Graph operations are implemented in a functional manner, so the data structure is immutable. @@ -23,8 +28,8 @@ class Graph: node `left` to node `right`.. """ - nodes: frozenset - edges: frozenset + def __init__(self, nodes, edges): + super().__init__(frozenset(nodes), frozenset(edges)) def transitive_closure(self, nodes, reverse=False): """Return the transitive closure of : the graph containing all @@ -67,7 +72,7 @@ def transitive_closure(self, nodes, reverse=False): add_nodes = {(left if reverse else right) for (left, right, _) in add_edges} new_nodes = nodes | add_nodes new_edges = edges | add_edges - return Graph(new_nodes, new_edges) # type: ignore + return Graph(new_nodes, new_edges) def _visit(self, reverse): queue = collections.deque(sorted(self.nodes)) diff --git a/src/taskgraph/morph.py b/src/taskgraph/morph.py index 01af7778f..357741f24 100644 --- a/src/taskgraph/morph.py +++ b/src/taskgraph/morph.py @@ -51,7 +51,7 @@ def amend_taskgraph(taskgraph, label_to_taskid, to_add): for depname, dep in task.dependencies.items(): new_edges.add((task.task_id, dep, depname)) - taskgraph = TaskGraph(new_tasks, Graph(set(new_tasks), new_edges)) # type: ignore + taskgraph = TaskGraph(new_tasks, Graph(frozenset(new_tasks), new_edges)) return taskgraph, label_to_taskid diff --git a/src/taskgraph/optimize/base.py b/src/taskgraph/optimize/base.py index 34d54f503..3d2776a87 100644 --- a/src/taskgraph/optimize/base.py +++ b/src/taskgraph/optimize/base.py @@ -451,7 +451,9 @@ def get_subgraph( if left in tasks_by_taskid and right in tasks_by_taskid } - return TaskGraph(tasks_by_taskid, Graph(set(tasks_by_taskid), edges_by_taskid)) # type: ignore + return TaskGraph( + tasks_by_taskid, Graph(frozenset(tasks_by_taskid), edges_by_taskid) + ) @register_strategy("never") diff --git a/src/taskgraph/taskgraph.py b/src/taskgraph/taskgraph.py index a94f615a2..839fcb385 100644 --- a/src/taskgraph/taskgraph.py +++ b/src/taskgraph/taskgraph.py @@ -67,5 +67,5 @@ def from_json(cls, tasks_dict): tasks[key].task_id = value["task_id"] for depname, dep in value["dependencies"].items(): edges.add((key, dep, depname)) - task_graph = cls(tasks, Graph(set(tasks), edges)) # type: ignore + task_graph = cls(tasks, Graph(frozenset(tasks), edges)) return tasks, task_graph From 338c4f98681770e18c701939f8af55c78d04beda Mon Sep 17 00:00:00 2001 From: Bastien Orivel Date: Fri, 17 Oct 2025 11:52:51 +0200 Subject: [PATCH 2/2] Optimize graph traversal by using a more appropriate algorithm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous implementation could visit nodes multiple times when a graph had diamond like patterns. The new implementation uses [Kahn's algorithm](https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm) which ensures that we only ever visit nodes once. I tested this on firefox-main with `./mach taskgraph full -J -p taskcluster/test/params/mc-onpush.yml`. Before this change, with a profiler attached, it would take ~3mn55. After this change, we're down to ~2mn30. Without profiling we go from 22s to 17s. I added a `links_and_reverse_links_dict` helper here since computing them both at the same time is faster than iterating over the graph twice. In my testing the function was called 11 times, 10 of which were on the exact same graph. Because that function is very expensive to call, I chose to cache it. The caching itself resulted in ~5-6s of time save when profiled on the full graph for firefox and ~1s when not profiled. Hyperfine output, full firefox graph, no profiler attached: ``` Benchmark 1: baseline Time (mean ± σ): 22.565 s ± 0.260 s [User: 53.205 s, System: 5.034 s] Range (min … max): 22.255 s … 23.043 s 10 runs Benchmark 2: with-patch Time (mean ± σ): 19.032 s ± 0.110 s [User: 49.556 s, System: 4.829 s] Range (min … max): 18.856 s … 19.195 s 10 runs Benchmark 3: with-patch-cache Time (mean ± σ): 17.800 s ± 0.250 s [User: 48.605 s, System: 4.893 s] Range (min … max): 17.281 s … 18.159 s 10 runs Summary with-patch-cache ran 1.07 ± 0.02 times faster than with-patch 1.27 ± 0.02 times faster than baseline ``` --- src/taskgraph/graph.py | 45 +++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/src/taskgraph/graph.py b/src/taskgraph/graph.py index 8c143d448..9951c696f 100644 --- a/src/taskgraph/graph.py +++ b/src/taskgraph/graph.py @@ -4,6 +4,7 @@ import collections +import functools from dataclasses import dataclass @@ -75,20 +76,25 @@ def transitive_closure(self, nodes, reverse=False): return Graph(new_nodes, new_edges) def _visit(self, reverse): - queue = collections.deque(sorted(self.nodes)) - links_by_node = self.reverse_links_dict() if reverse else self.links_dict() - seen = set() + forward_links, reverse_links = self.links_and_reverse_links_dict() + + dependencies = reverse_links if reverse else forward_links + dependents = forward_links if reverse else reverse_links + + indegree = {node: len(dependencies[node]) for node in self.nodes} + + queue = collections.deque( + node for node, degree in indegree.items() if degree == 0 + ) + while queue: node = queue.popleft() - if node in seen: - continue - links = links_by_node[node] - if all((n in seen) for n in links): - seen.add(node) - yield node - else: - queue.extend(n for n in links if n not in seen) - queue.append(node) + yield node + + for dependent in dependents[node]: + indegree[dependent] -= 1 + if indegree[dependent] == 0: + queue.append(dependent) def visit_postorder(self): """ @@ -107,6 +113,21 @@ def visit_preorder(self): """ return self._visit(True) + @functools.cache + def links_and_reverse_links_dict(self): + """ + Return both links and reverse_links dictionaries. + Returns a (forward_links, reverse_links) tuple where forward_links maps + each node to the set of nodes it links to, and reverse_links maps each + node to the set of nodes linking to it. + """ + forward = collections.defaultdict(set) + reverse = collections.defaultdict(set) + for left, right, _ in self.edges: + forward[left].add(right) + reverse[right].add(left) + return (forward, reverse) + def links_dict(self): """ Return a dictionary mapping each node to a set of the nodes it links to