From 8b27585f9d2f82b7b48206ec7d0139565ef524a1 Mon Sep 17 00:00:00 2001 From: Chase Xu <80196056+Chase-Xuu@users.noreply.github.com> Date: Mon, 9 Mar 2026 20:43:08 -0500 Subject: [PATCH] fix: restore module buffers during stream-based group offload In `_offload_to_memory()`, when using CUDA streams, module buffers from `self.modules` were not being restored to their CPU copies. This created an asymmetry with `_build_cpu_param_dict()` and `_process_tensors_from_modules()` (onload), which both handle `group_module.buffers()`. The missing buffer restoration could cause: - Stale buffer data on subsequent onload cycles - Memory leaks (GPU tensors not released) - Potential NaN values in models with stateful buffers (e.g., normalization layers) when used with `record_stream=True` Fixes the stream path to match the non-stream path, which correctly moves all module state via `group_module.to()`. Related: #12613 Signed-off-by: Chase Xu <80196056+Chase-Xuu@users.noreply.github.com> --- src/diffusers/hooks/group_offloading.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/diffusers/hooks/group_offloading.py b/src/diffusers/hooks/group_offloading.py index 891ac28455af..30c0834a9e05 100644 --- a/src/diffusers/hooks/group_offloading.py +++ b/src/diffusers/hooks/group_offloading.py @@ -246,6 +246,8 @@ def _offload_to_memory(self): for group_module in self.modules: for param in group_module.parameters(): param.data = self.cpu_param_dict[param] + for buffer in group_module.buffers(): + buffer.data = self.cpu_param_dict[buffer] for param in self.parameters: param.data = self.cpu_param_dict[param] for buffer in self.buffers: