From 98f8464b288e1a431c204801401bf688302f8486 Mon Sep 17 00:00:00 2001
From: Bastien Orivel <borivel@mozilla.com>
Date: Fri, 17 Oct 2025 14:54:24 +0200
Subject: [PATCH 1/2] Force `Graph` to actually use frozensets

Because type annotations in python are a "I wish people would respect my
authority" kind of feature, we cannot guarantee that a `Graph` is
actually created with frozensets as attributes. That is the case all
throughout taskgraph where we're passing sets. By forcing those
attributes to be frozensets we can ensure that `Graph` is hashable which
will allow us to use `functools.lru_cache` to cache some expensive
functions in here.

Unfortunately, there's no easy way of transforming the inputs passed
into a dataclass, since in both `__init__` and `__post_init__`, the
structure is already frozen. By using a dataclass as the parent of
`Graph` instead of `Graph` itself, we can work around the issue while
keeping the frozenness of the dataclass post init without having to
resort to working around dataclass when transforming the attributes
---
 src/taskgraph/graph.py         | 13 +++++++++----
 src/taskgraph/morph.py         |  2 +-
 src/taskgraph/optimize/base.py |  4 +++-
 src/taskgraph/taskgraph.py     |  2 +-
 4 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/src/taskgraph/graph.py b/src/taskgraph/graph.py
index c0c089035..8c143d448 100644
--- a/src/taskgraph/graph.py
+++ b/src/taskgraph/graph.py
@@ -8,7 +8,12 @@
 
 
 @dataclass(frozen=True)
-class Graph:
+class _Graph:
+    nodes: frozenset
+    edges: frozenset
+
+
+class Graph(_Graph):
     """Generic representation of a directed acyclic graph with labeled edges
     connecting the nodes. Graph operations are implemented in a functional
     manner, so the data structure is immutable.
@@ -23,8 +28,8 @@ class Graph:
     node `left` to node `right`..
     """
 
-    nodes: frozenset
-    edges: frozenset
+    def __init__(self, nodes, edges):
+        super().__init__(frozenset(nodes), frozenset(edges))
 
     def transitive_closure(self, nodes, reverse=False):
         """Return the transitive closure of <nodes>: the graph containing all
@@ -67,7 +72,7 @@ def transitive_closure(self, nodes, reverse=False):
             add_nodes = {(left if reverse else right) for (left, right, _) in add_edges}
             new_nodes = nodes | add_nodes
             new_edges = edges | add_edges
-        return Graph(new_nodes, new_edges)  # type: ignore
+        return Graph(new_nodes, new_edges)
 
     def _visit(self, reverse):
         queue = collections.deque(sorted(self.nodes))
diff --git a/src/taskgraph/morph.py b/src/taskgraph/morph.py
index 01af7778f..357741f24 100644
--- a/src/taskgraph/morph.py
+++ b/src/taskgraph/morph.py
@@ -51,7 +51,7 @@ def amend_taskgraph(taskgraph, label_to_taskid, to_add):
         for depname, dep in task.dependencies.items():
             new_edges.add((task.task_id, dep, depname))
 
-    taskgraph = TaskGraph(new_tasks, Graph(set(new_tasks), new_edges))  # type: ignore
+    taskgraph = TaskGraph(new_tasks, Graph(frozenset(new_tasks), new_edges))
     return taskgraph, label_to_taskid
 
 
diff --git a/src/taskgraph/optimize/base.py b/src/taskgraph/optimize/base.py
index 34d54f503..3d2776a87 100644
--- a/src/taskgraph/optimize/base.py
+++ b/src/taskgraph/optimize/base.py
@@ -451,7 +451,9 @@ def get_subgraph(
         if left in tasks_by_taskid and right in tasks_by_taskid
     }
 
-    return TaskGraph(tasks_by_taskid, Graph(set(tasks_by_taskid), edges_by_taskid))  # type: ignore
+    return TaskGraph(
+        tasks_by_taskid, Graph(frozenset(tasks_by_taskid), edges_by_taskid)
+    )
 
 
 @register_strategy("never")
diff --git a/src/taskgraph/taskgraph.py b/src/taskgraph/taskgraph.py
index a94f615a2..839fcb385 100644
--- a/src/taskgraph/taskgraph.py
+++ b/src/taskgraph/taskgraph.py
@@ -67,5 +67,5 @@ def from_json(cls, tasks_dict):
                 tasks[key].task_id = value["task_id"]
             for depname, dep in value["dependencies"].items():
                 edges.add((key, dep, depname))
-        task_graph = cls(tasks, Graph(set(tasks), edges))  # type: ignore
+        task_graph = cls(tasks, Graph(frozenset(tasks), edges))
         return tasks, task_graph

From 338c4f98681770e18c701939f8af55c78d04beda Mon Sep 17 00:00:00 2001
From: Bastien Orivel <borivel@mozilla.com>
Date: Fri, 17 Oct 2025 11:52:51 +0200
Subject: [PATCH 2/2] Optimize graph traversal by using a more appropriate
 algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous implementation could visit nodes multiple times when a
graph had diamond like patterns. The new implementation uses [Kahn's
algorithm](https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm)
which ensures that we only ever visit nodes once.

I tested this on firefox-main with `./mach taskgraph full -J -p
taskcluster/test/params/mc-onpush.yml`. Before this change, with a
profiler attached, it would take ~3mn55. After this change, we're down
to ~2mn30. Without profiling we go from 22s to 17s.

I added a `links_and_reverse_links_dict` helper here since computing
them both at the same time is faster than iterating over the graph
twice. In my testing the function was called 11 times, 10 of which were
on the exact same graph. Because that function is very expensive to
call, I chose to cache it.

The caching itself resulted in ~5-6s of time save when profiled on the
full graph for firefox and ~1s when not profiled.

Hyperfine output, full firefox graph, no profiler attached:

```
Benchmark 1: baseline
  Time (mean ± σ):     22.565 s ±  0.260 s    [User: 53.205 s, System: 5.034 s]
  Range (min … max):   22.255 s … 23.043 s    10 runs

Benchmark 2: with-patch
  Time (mean ± σ):     19.032 s ±  0.110 s    [User: 49.556 s, System: 4.829 s]
  Range (min … max):   18.856 s … 19.195 s    10 runs

Benchmark 3: with-patch-cache
  Time (mean ± σ):     17.800 s ±  0.250 s    [User: 48.605 s, System: 4.893 s]
  Range (min … max):   17.281 s … 18.159 s    10 runs

Summary
  with-patch-cache ran
    1.07 ± 0.02 times faster than with-patch
    1.27 ± 0.02 times faster than baseline
```
---
 src/taskgraph/graph.py | 45 +++++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/src/taskgraph/graph.py b/src/taskgraph/graph.py
index 8c143d448..9951c696f 100644
--- a/src/taskgraph/graph.py
+++ b/src/taskgraph/graph.py
@@ -4,6 +4,7 @@
 
 
 import collections
+import functools
 from dataclasses import dataclass
 
 
@@ -75,20 +76,25 @@ def transitive_closure(self, nodes, reverse=False):
         return Graph(new_nodes, new_edges)
 
     def _visit(self, reverse):
-        queue = collections.deque(sorted(self.nodes))
-        links_by_node = self.reverse_links_dict() if reverse else self.links_dict()
-        seen = set()
+        forward_links, reverse_links = self.links_and_reverse_links_dict()
+
+        dependencies = reverse_links if reverse else forward_links
+        dependents = forward_links if reverse else reverse_links
+
+        indegree = {node: len(dependencies[node]) for node in self.nodes}
+
+        queue = collections.deque(
+            node for node, degree in indegree.items() if degree == 0
+        )
+
         while queue:
             node = queue.popleft()
-            if node in seen:
-                continue
-            links = links_by_node[node]
-            if all((n in seen) for n in links):
-                seen.add(node)
-                yield node
-            else:
-                queue.extend(n for n in links if n not in seen)
-                queue.append(node)
+            yield node
+
+            for dependent in dependents[node]:
+                indegree[dependent] -= 1
+                if indegree[dependent] == 0:
+                    queue.append(dependent)
 
     def visit_postorder(self):
         """
@@ -107,6 +113,21 @@ def visit_preorder(self):
         """
         return self._visit(True)
 
+    @functools.cache
+    def links_and_reverse_links_dict(self):
+        """
+        Return both links and reverse_links dictionaries.
+        Returns a (forward_links, reverse_links) tuple where forward_links maps
+        each node to the set of nodes it links to, and reverse_links maps each
+        node to the set of nodes linking to it.
+        """
+        forward = collections.defaultdict(set)
+        reverse = collections.defaultdict(set)
+        for left, right, _ in self.edges:
+            forward[left].add(right)
+            reverse[right].add(left)
+        return (forward, reverse)
+
     def links_dict(self):
         """
         Return a dictionary mapping each node to a set of the nodes it links to