Skip to content

Commit db59ec6

Browse files
refactor!: Made some parameters keyword-only or positional-only (#578)
* (Private interface) make `GeneralizedWeighting.forward` take a positional-only argument. * (Private interface) make `Aggregator.forward` take a single positional-only argument. * Change many parameters of the public interface to be either positional-only or keyword-only. Refer to the changelog change for the exact list. * Add changelog entry. --------- Co-authored-by: Valérian Rey <valerian.rey@gmail.com>
1 parent f872551 commit db59ec6

20 files changed

Lines changed: 75 additions & 55 deletions

CHANGELOG.md

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,21 @@ changelog does not include internal changes that do not affect the user.
5353
mtl_backward(losses, features)
5454
jac_to_grad(shared_module.parameters(), aggregator)
5555
```
56-
- **BREAKING**: Renamed the `losses` parameter of `mtl_backward` to `tensors`.
57-
56+
- **BREAKING**: Made some parameters of the public interface of `torchjd` positional-only or
57+
keyword-only:
58+
- `backward`: The `tensors` parameter is now positional-only. Suggested change:
59+
`backward(tensors=losses)` => `backward(losses)`. All other parameters are now keyword-only.
60+
- `mtl_backward`: The `tensors` parameter (previously named `losses`) is now positional-only.
61+
Suggested change: `mtl_backward(losses=losses, features=features)` =>
62+
`mtl_backward(losses, features=features)`. The `features` parameter remains usable as positional
63+
or keyword. All other parameters are now keyword-only.
64+
- `Aggregator.__call__`: The `matrix` parameter is now positional-only. Suggested change:
65+
`aggregator(matrix=matrix)` => `aggregator(matrix)`.
66+
- `Weighting.__call__`: The `stat` parameter is now positional-only. Suggested change:
67+
`weighting(stat=gramian)` => `weighting(gramian)`.
68+
- `GeneralizedWeighting.__call__`: The `generalized_gramian` parameter is now positional-only.
69+
Suggested change: `generalized_weighting(generalized_gramian=generalized_gramian)` =>
70+
`generalized_weighting(generalized_gramian)`.
5871
- Removed an unnecessary memory duplication. This should significantly improve the memory efficiency
5972
of `autojac`.
6073
- Removed an unnecessary internal cloning of gradient. This should slightly improve the memory

docs/source/examples/amp.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ following example shows the resulting code for a multi-task learning use-case.
4848
loss2 = loss_fn(output2, target2)
4949
5050
scaled_losses = scaler.scale([loss1, loss2])
51-
mtl_backward(tensors=scaled_losses, features=features)
51+
mtl_backward(scaled_losses, features=features)
5252
jac_to_grad(shared_module.parameters(), aggregator)
5353
scaler.step(optimizer)
5454
scaler.update()

docs/source/examples/lightning_integration.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ The following code example demonstrates a basic multi-task learning setup using
4343
loss2 = mse_loss(output2, target2)
4444
4545
opt = self.optimizers()
46-
mtl_backward(tensors=[loss1, loss2], features=features)
46+
mtl_backward([loss1, loss2], features=features)
4747
jac_to_grad(self.feature_extractor.parameters(), UPGrad())
4848
opt.step()
4949
opt.zero_grad()

docs/source/examples/monitoring.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ they have a negative inner product).
6363
loss1 = loss_fn(output1, target1)
6464
loss2 = loss_fn(output2, target2)
6565
66-
mtl_backward(tensors=[loss1, loss2], features=features)
66+
mtl_backward([loss1, loss2], features=features)
6767
jac_to_grad(shared_module.parameters(), aggregator)
6868
optimizer.step()
6969
optimizer.zero_grad()

docs/source/examples/mtl.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ vectors of dimension 10, and their corresponding scalar labels for both tasks.
5252
loss1 = loss_fn(output1, target1)
5353
loss2 = loss_fn(output2, target2)
5454
55-
mtl_backward(tensors=[loss1, loss2], features=features)
55+
mtl_backward([loss1, loss2], features=features)
5656
jac_to_grad(shared_module.parameters(), aggregator)
5757
optimizer.step()
5858
optimizer.zero_grad()

src/torchjd/aggregation/_aggregator_bases.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,10 @@ def _check_is_matrix(matrix: Tensor) -> None:
2525
)
2626

2727
@abstractmethod
28-
def forward(self, matrix: Matrix) -> Tensor:
28+
def forward(self, matrix: Matrix, /) -> Tensor:
2929
"""Computes the aggregation from the input matrix."""
3030

31-
def __call__(self, matrix: Tensor) -> Tensor:
31+
def __call__(self, matrix: Tensor, /) -> Tensor:
3232
"""Computes the aggregation from the input matrix and applies all registered hooks."""
3333
Aggregator._check_is_matrix(matrix)
3434
return super().__call__(matrix)
@@ -62,7 +62,7 @@ def combine(matrix: Matrix, weights: Tensor) -> Tensor:
6262
vector = weights @ matrix
6363
return vector
6464

65-
def forward(self, matrix: Matrix) -> Tensor:
65+
def forward(self, matrix: Matrix, /) -> Tensor:
6666
weights = self.weighting(matrix)
6767
vector = self.combine(matrix, weights)
6868
return vector

src/torchjd/aggregation/_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def __init__(self, pref_vector: Tensor | None = None):
5858
# This prevents computing gradients that can be very wrong.
5959
self.register_full_backward_pre_hook(raise_non_differentiable_error)
6060

61-
def forward(self, matrix: Matrix) -> Tensor:
61+
def forward(self, matrix: Matrix, /) -> Tensor:
6262
weights = self.weighting(matrix)
6363
units = torch.nan_to_num((matrix / (matrix.norm(dim=1)).unsqueeze(1)), 0.0)
6464
best_direction = torch.linalg.pinv(units) @ weights

src/torchjd/aggregation/_flattening.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def __init__(self, weighting: Weighting):
2424
super().__init__()
2525
self.weighting = weighting
2626

27-
def forward(self, generalized_gramian: PSDTensor) -> Tensor:
27+
def forward(self, generalized_gramian: PSDTensor, /) -> Tensor:
2828
k = generalized_gramian.ndim // 2
2929
shape = generalized_gramian.shape[:k]
3030
square_gramian = flatten(generalized_gramian)

src/torchjd/aggregation/_graddrop.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def __init__(self, f: Callable = _identity, leak: Tensor | None = None):
4040
# This prevents computing gradients that can be very wrong.
4141
self.register_full_backward_pre_hook(raise_non_differentiable_error)
4242

43-
def forward(self, matrix: Matrix) -> Tensor:
43+
def forward(self, matrix: Matrix, /) -> Tensor:
4444
self._check_matrix_has_enough_rows(matrix)
4545

4646
if matrix.shape[0] == 0 or matrix.shape[1] == 0:

src/torchjd/aggregation/_trimmed_mean.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def __init__(self, trim_number: int):
2424
)
2525
self.trim_number = trim_number
2626

27-
def forward(self, matrix: Tensor) -> Tensor:
27+
def forward(self, matrix: Tensor, /) -> Tensor:
2828
self._check_matrix_has_enough_rows(matrix)
2929

3030
n_rows = matrix.shape[0]

0 commit comments

Comments
 (0)