From 7a587349943325e667866971a36996a56fcff143 Mon Sep 17 00:00:00 2001
From: Yao Matrix
Date: Tue, 23 Sep 2025 21:01:45 -0700
Subject: [PATCH 1/2] enable XPU for 4 cases (#12345)

Signed-off-by: Yao, Matrix
---
 .../modular_pipelines/components_manager.py  | 13 ++++++++++---
 tests/lora/test_lora_layers_hunyuanvideo.py  |  1 +
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/modular_pipelines/components_manager.py b/src/diffusers/modular_pipelines/components_manager.py
index f48a227e2e..ed847fa414 100644
--- a/src/diffusers/modular_pipelines/components_manager.py
+++ b/src/diffusers/modular_pipelines/components_manager.py
@@ -25,6 +25,7 @@ from ..utils import (
     is_accelerate_available,
     logging,
 )
+from ..utils.torch_utils import get_device
 
 
 if is_accelerate_available():
@@ -161,7 +162,9 @@ class AutoOffloadStrategy:
 
         current_module_size = model.get_memory_footprint()
 
-        mem_on_device = torch.cuda.mem_get_info(execution_device.index)[0]
+        device_type = execution_device.type
+        device_module = getattr(torch, device_type, torch.cuda)
+        mem_on_device = device_module.mem_get_info(execution_device.index)[0]
         mem_on_device = mem_on_device - self.memory_reserve_margin
         if current_module_size < mem_on_device:
             return []
@@ -301,7 +304,7 @@ class ComponentsManager:
         cm.add("vae", vae_model, collection="sdxl")
 
         # Enable auto offloading
-        cm.enable_auto_cpu_offload(device="cuda")
+        cm.enable_auto_cpu_offload()
 
         # Retrieve components
         unet = cm.get_one(name="unet", collection="sdxl")
@@ -490,6 +493,8 @@ class ComponentsManager:
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
+        if torch.xpu.is_available():
+            torch.xpu.empty_cache()
 
     # YiYi TODO: rename to search_components for now, may remove this method
     def search_components(
@@ -678,7 +683,7 @@ class ComponentsManager:
 
         return get_return_dict(matches, return_dict_with_names)
 
-    def enable_auto_cpu_offload(self, device: Union[str, int, torch.device] = "cuda", memory_reserve_margin="3GB"):
+    def enable_auto_cpu_offload(self, device: Union[str, int, torch.device] = None, memory_reserve_margin="3GB"):
         """
         Enable automatic CPU offloading for all components.
 
@@ -704,6 +709,8 @@ class ComponentsManager:
         self.disable_auto_cpu_offload()
 
         offload_strategy = AutoOffloadStrategy(memory_reserve_margin=memory_reserve_margin)
+        if device is None:
+            device = get_device()
         device = torch.device(device)
         if device.index is None:
             device = torch.device(f"{device.type}:{0}")
diff --git a/tests/lora/test_lora_layers_hunyuanvideo.py b/tests/lora/test_lora_layers_hunyuanvideo.py
index 7ea0f1fcc9..cfd5d3146a 100644
--- a/tests/lora/test_lora_layers_hunyuanvideo.py
+++ b/tests/lora/test_lora_layers_hunyuanvideo.py
@@ -253,6 +253,7 @@ class HunyuanVideoLoRAIntegrationTests(unittest.TestCase):
         expected_slices = Expectations(
             {
                 ("cuda", 7): np.array([0.1013, 0.1924, 0.0078, 0.1021, 0.1929, 0.0078, 0.1023, 0.1919, 0.7402, 0.104, 0.4482, 0.7354, 0.0925, 0.4382, 0.7275, 0.0815]),
+                ("xpu", 3): np.array([0.1013, 0.1924, 0.0078, 0.1021, 0.1929, 0.0078, 0.1023, 0.1919, 0.7402, 0.104, 0.4482, 0.7354, 0.0925, 0.4382, 0.7275, 0.0815]),
             }
         )
         # fmt: on

From 08c29020dd558899a53137441f908ab668c427d6 Mon Sep 17 00:00:00 2001
From: Yao Matrix
Date: Tue, 23 Sep 2025 21:02:06 -0700
Subject: [PATCH 2/2] fix Marigold UT case failures on XPU (#12350)

Signed-off-by: Yao, Matrix
---
 .../pipelines/marigold/test_marigold_depth.py | 30 +++++++++++++------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/tests/pipelines/marigold/test_marigold_depth.py b/tests/pipelines/marigold/test_marigold_depth.py
index 3e8ccbf5c0..3c85305992 100644
--- a/tests/pipelines/marigold/test_marigold_depth.py
+++ b/tests/pipelines/marigold/test_marigold_depth.py
@@ -33,6 +33,7 @@ from diffusers import (
 )
 
 from ...testing_utils import (
+    Expectations,
     backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
@@ -356,7 +357,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
+    def test_marigold_depth_einstein_f32_accelerator_G0_S1_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=False,
             device=torch_device,
@@ -369,7 +370,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
+    def test_marigold_depth_einstein_f16_accelerator_G0_S1_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
@@ -382,7 +383,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
+    def test_marigold_depth_einstein_f16_accelerator_G2024_S1_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
@@ -395,12 +396,23 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
+    def test_marigold_depth_einstein_f16_accelerator_G0_S2_P768_E1_B1_M1(self):
+        # fmt: off
+        expected_slices = Expectations(
+            {
+                ("cuda", 7): np.array([0.1085, 0.1098, 0.1110, 0.1081, 0.1085, 0.1082, 0.1085, 0.1057, 0.0996]),
+                ("xpu", 3): np.array([0.1084, 0.1096, 0.1108, 0.1080, 0.1083, 0.1080,
+                                      0.1085, 0.1057, 0.0996]),
+            }
+        )
+        expected_slice = expected_slices.get_expectation()
+        # fmt: on
+
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
             generator_seed=0,
-            expected_slice=np.array([0.1085, 0.1098, 0.1110, 0.1081, 0.1085, 0.1082, 0.1085, 0.1057, 0.0996]),
+            expected_slice=expected_slice,
             num_inference_steps=2,
             processing_resolution=768,
             ensemble_size=1,
@@ -408,7 +420,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
+    def test_marigold_depth_einstein_f16_accelerator_G0_S1_P512_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
@@ -421,7 +433,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
+    def test_marigold_depth_einstein_f16_accelerator_G0_S1_P768_E3_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
@@ -435,7 +447,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
+    def test_marigold_depth_einstein_f16_accelerator_G0_S1_P768_E4_B2_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
@@ -449,7 +461,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
        )
 
-    def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self):
+    def test_marigold_depth_einstein_f16_accelerator_G0_S1_P512_E1_B1_M0(self):
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
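
Reviewer note, not part of the patches: the heart of PATCH 1/2 is the backend-module
dispatch in AutoOffloadStrategy. Below is a minimal standalone sketch of that pattern,
assuming a PyTorch build where both torch.cuda.mem_get_info and torch.xpu.mem_get_info
return a (free_bytes, total_bytes) tuple; the probe_free_memory helper name is ours
for illustration and does not appear in the diff.

import torch

def probe_free_memory(execution_device: torch.device) -> int:
    # Resolve the backend namespace (torch.cuda, torch.xpu, ...) from the
    # device type, falling back to torch.cuda exactly as the patch does.
    device_module = getattr(torch, execution_device.type, torch.cuda)
    # mem_get_info returns (free_bytes, total_bytes) for the given index.
    return device_module.mem_get_info(execution_device.index)[0]

# Usage: mirrors what the patch does after resolving a default device via
# diffusers' get_device() helper (assumes an accelerator is present).
device_type = "xpu" if torch.xpu.is_available() else "cuda"
print(probe_free_memory(torch.device(f"{device_type}:0")))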