From 7a587349943325e667866971a36996a56fcff143 Mon Sep 17 00:00:00 2001
From: Yao Matrix
Date: Tue, 23 Sep 2025 21:01:45 -0700
Subject: [PATCH 1/2] enable XPU for 4 cases (#12345)

Signed-off-by: Yao, Matrix
---
 .../modular_pipelines/components_manager.py  | 13 ++++++++++---
 tests/lora/test_lora_layers_hunyuanvideo.py  |  1 +
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/modular_pipelines/components_manager.py b/src/diffusers/modular_pipelines/components_manager.py
index f48a227e2e..ed847fa414 100644
--- a/src/diffusers/modular_pipelines/components_manager.py
+++ b/src/diffusers/modular_pipelines/components_manager.py
@@ -25,6 +25,7 @@ from ..utils import (
     is_accelerate_available,
     logging,
 )
+from ..utils.torch_utils import get_device
 
 
 if is_accelerate_available():
@@ -161,7 +162,9 @@ class AutoOffloadStrategy:
 
         current_module_size = model.get_memory_footprint()
 
-        mem_on_device = torch.cuda.mem_get_info(execution_device.index)[0]
+        device_type = execution_device.type
+        device_module = getattr(torch, device_type, torch.cuda)
+        mem_on_device = device_module.mem_get_info(execution_device.index)[0]
         mem_on_device = mem_on_device - self.memory_reserve_margin
         if current_module_size < mem_on_device:
             return []
@@ -301,7 +304,7 @@ class ComponentsManager:
         cm.add("vae", vae_model, collection="sdxl")
 
         # Enable auto offloading
-        cm.enable_auto_cpu_offload(device="cuda")
+        cm.enable_auto_cpu_offload()
 
         # Retrieve components
         unet = cm.get_one(name="unet", collection="sdxl")
@@ -490,6 +493,8 @@ class ComponentsManager:
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
+        if torch.xpu.is_available():
+            torch.xpu.empty_cache()
 
     # YiYi TODO: rename to search_components for now, may remove this method
     def search_components(
@@ -678,7 +683,7 @@ class ComponentsManager:
 
         return get_return_dict(matches, return_dict_with_names)
 
-    def enable_auto_cpu_offload(self, device: Union[str, int, torch.device] = "cuda", memory_reserve_margin="3GB"):
+    def enable_auto_cpu_offload(self, device: Union[str, int, torch.device] = None, memory_reserve_margin="3GB"):
         """
         Enable automatic CPU offloading for all components.
 
@@ -704,6 +709,8 @@ class ComponentsManager:
         self.disable_auto_cpu_offload()
 
         offload_strategy = AutoOffloadStrategy(memory_reserve_margin=memory_reserve_margin)
+        if device is None:
+            device = get_device()
         device = torch.device(device)
         if device.index is None:
             device = torch.device(f"{device.type}:{0}")
diff --git a/tests/lora/test_lora_layers_hunyuanvideo.py b/tests/lora/test_lora_layers_hunyuanvideo.py
index 7ea0f1fcc9..cfd5d3146a 100644
--- a/tests/lora/test_lora_layers_hunyuanvideo.py
+++ b/tests/lora/test_lora_layers_hunyuanvideo.py
@@ -253,6 +253,7 @@ class HunyuanVideoLoRAIntegrationTests(unittest.TestCase):
         expected_slices = Expectations(
             {
                 ("cuda", 7): np.array([0.1013, 0.1924, 0.0078, 0.1021, 0.1929, 0.0078, 0.1023, 0.1919, 0.7402, 0.104, 0.4482, 0.7354, 0.0925, 0.4382, 0.7275, 0.0815]),
+                ("xpu", 3): np.array([0.1013, 0.1924, 0.0078, 0.1021, 0.1929, 0.0078, 0.1023, 0.1919, 0.7402, 0.104, 0.4482, 0.7354, 0.0925, 0.4382, 0.7275, 0.0815]),
             }
         )
         # fmt: on

From 08c29020dd558899a53137441f908ab668c427d6 Mon Sep 17 00:00:00 2001
From: Yao Matrix
Date: Tue, 23 Sep 2025 21:02:06 -0700
Subject: [PATCH 2/2] fix Marigold UT case failures on XPU (#12350)

Signed-off-by: Yao, Matrix
---
 .../pipelines/marigold/test_marigold_depth.py | 30 +++++++++++++------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/tests/pipelines/marigold/test_marigold_depth.py b/tests/pipelines/marigold/test_marigold_depth.py
index 3e8ccbf5c0..3c85305992 100644
--- a/tests/pipelines/marigold/test_marigold_depth.py
+++ b/tests/pipelines/marigold/test_marigold_depth.py
@@ -33,6 +33,7 @@ from diffusers import (
 )
 
 from ...testing_utils import (
+    Expectations,
     backend_empty_cache,
     enable_full_determinism,
     floats_tensor,
@@ -356,7 +357,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self):
+    def test_marigold_depth_einstein_f32_accelerator_G0_S1_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=False,
             device=torch_device,
@@ -369,7 +370,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self):
+    def test_marigold_depth_einstein_f16_accelerator_G0_S1_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
@@ -382,7 +383,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self):
+    def test_marigold_depth_einstein_f16_accelerator_G2024_S1_P768_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
@@ -395,12 +396,23 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self):
+    def test_marigold_depth_einstein_f16_accelerator_G0_S2_P768_E1_B1_M1(self):
+        # fmt: off
+        expected_slices = Expectations(
+            {
+                ("cuda", 7): np.array([0.1085, 0.1098, 0.1110, 0.1081, 0.1085, 0.1082, 0.1085, 0.1057, 0.0996]),
+                ("xpu", 3): np.array([0.1084, 0.1096, 0.1108, 0.1080, 0.1083, 0.1080,
+                                      0.1085, 0.1057, 0.0996]),
+            }
+        )
+        expected_slice = expected_slices.get_expectation()
+        # fmt: on
+
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
             generator_seed=0,
-            expected_slice=np.array([0.1085, 0.1098, 0.1110, 0.1081, 0.1085, 0.1082, 0.1085, 0.1057, 0.0996]),
+            expected_slice=expected_slice,
             num_inference_steps=2,
             processing_resolution=768,
             ensemble_size=1,
@@ -408,7 +420,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self):
+    def test_marigold_depth_einstein_f16_accelerator_G0_S1_P512_E1_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
@@ -421,7 +433,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self):
+    def test_marigold_depth_einstein_f16_accelerator_G0_S1_P768_E3_B1_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
@@ -435,7 +447,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
         )
 
-    def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self):
+    def test_marigold_depth_einstein_f16_accelerator_G0_S1_P768_E4_B2_M1(self):
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
@@ -449,7 +461,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase):
             match_input_resolution=True,
        )
 
-    def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self):
+    def test_marigold_depth_einstein_f16_accelerator_G0_S1_P512_E1_B1_M0(self):
         self._test_marigold_depth(
             is_fp16=True,
             device=torch_device,
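
Reviewer note, not part of the patches: the heart of PATCH 1/2 is the backend-module
dispatch in AutoOffloadStrategy. Below is a minimal standalone sketch of that pattern,
assuming a PyTorch build where both torch.cuda.mem_get_info and torch.xpu.mem_get_info
return a (free_bytes, total_bytes) tuple; the probe_free_memory helper name is ours
for illustration and does not appear in the diff.

import torch

def probe_free_memory(execution_device: torch.device) -> int:
    # Resolve the backend namespace (torch.cuda, torch.xpu, ...) from the
    # device type, falling back to torch.cuda exactly as the patch does.
    device_module = getattr(torch, execution_device.type, torch.cuda)
    # mem_get_info returns (free_bytes, total_bytes) for the given index.
    return device_module.mem_get_info(execution_device.index)[0]

# Usage: mirrors what the patch does after resolving a default device via
# diffusers' get_device() helper (assumes an accelerator is present).
device_type = "xpu" if torch.xpu.is_available() else "cuda"
print(probe_free_memory(torch.device(f"{device_type}:0")))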