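update: make `source_tensor` a required argument of `ModuleGroup._transfer_tensor_to_device` and move `ModuleGroup.onload_` (unchanged) from before `_onload_from_disk` to just before `_offload_to_disk`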

2026-01-29 07:22:12 +03:00 · 2025-06-19 19:22:06 +05:30
parent ace698aa96
commit 135934893e
1 changed files with 21 additions and 23 deletions
--- a/src/diffusers/hooks/group_offloading.py
+++ b/src/diffusers/hooks/group_offloading.py
@@ -135,9 +135,7 @@ class ModuleGroup:
        finally:
            pinned_dict = None

-    def _transfer_tensor_to_device(self, tensor, source_tensor=None, current_stream=None):
-        if source_tensor is None:
-            source_tensor = tensor
+    def _transfer_tensor_to_device(self, tensor, source_tensor, current_stream=None):
        tensor.data = source_tensor.to(self.onload_device, non_blocking=self.non_blocking)
        if self.record_stream and current_stream is not None:
            tensor.data.record_stream(current_stream)
@@ -159,26 +157,6 @@ class ModuleGroup:
            source = pinned_memory[buffer] if pinned_memory else buffer.data
            self._transfer_tensor_to_device(buffer, source, current_stream)

-    @torch.compiler.disable()
-    def onload_(self):
-        torch_accelerator_module = (
-            getattr(torch, torch.accelerator.current_accelerator().type)
-            if hasattr(torch, "accelerator")
-            else torch.cuda
-        )
-        context = nullcontext() if self.stream is None else torch_accelerator_module.stream(self.stream)
-        current_stream = torch_accelerator_module.current_stream() if self.record_stream else None
-
-        if self.stream is not None:
-            # Wait for previous Host->Device transfer to complete
-            self.stream.synchronize()
-
-        with context:
-            if self.offload_to_disk_path:
-                self._onload_from_disk(current_stream)
-            else:
-                self._onload_from_memory(current_stream)
-
    def _onload_from_disk(self, current_stream):
        if self.stream is not None:
            loaded_cpu_tensors = safetensors.torch.load_file(self.safetensors_file_path, device="cpu")
@@ -207,6 +185,26 @@ class ModuleGroup:
        else:
            self._process_tensors_from_modules(None, current_stream)

+    @torch.compiler.disable()
+    def onload_(self):
+        torch_accelerator_module = (
+            getattr(torch, torch.accelerator.current_accelerator().type)
+            if hasattr(torch, "accelerator")
+            else torch.cuda
+        )
+        context = nullcontext() if self.stream is None else torch_accelerator_module.stream(self.stream)
+        current_stream = torch_accelerator_module.current_stream() if self.record_stream else None
+
+        if self.stream is not None:
+            # Wait for previous Host->Device transfer to complete
+            self.stream.synchronize()
+
+        with context:
+            if self.offload_to_disk_path:
+                self._onload_from_disk(current_stream)
+            else:
+                self._onload_from_memory(current_stream)
+
    @torch.compiler.disable()
    def _offload_to_disk(self):
        # TODO: we can potentially optimize this code path by checking if the _all_ the desired