
[tests] make tests device-agnostic (part 3) (#10437)

* initial commit

* fix empty cache

* fix one more

* fix style

* update device functions

* update

* update

* Update src/diffusers/utils/testing_utils.py

Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py

Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py

Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/controlnet/test_controlnet.py

Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py

Co-authored-by: hlky <hlky@hlky.ac>

* Update src/diffusers/utils/testing_utils.py

Co-authored-by: hlky <hlky@hlky.ac>

* Update tests/pipelines/controlnet/test_controlnet.py

Co-authored-by: hlky <hlky@hlky.ac>

* with gc.collect

* update

* make style

* check_torch_dependencies

* add mps empty cache

* bug fix

* Apply suggestions from code review

---------

Co-authored-by: hlky <hlky@hlky.ac>
Author: Fanli Lin
Date: 2025-01-21 20:15:45 +08:00
Committed by: GitHub
Parent commit: 158a5a87fb
Commit: ec37e20972
26 changed files with 275 additions and 170 deletions


@@ -86,7 +86,12 @@ if is_torch_available():
) from e
logger.info(f"torch_device overrode to {torch_device}")
else:
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
if torch.cuda.is_available():
torch_device = "cuda"
elif torch.xpu.is_available():
torch_device = "xpu"
else:
torch_device = "cpu"
is_torch_higher_equal_than_1_12 = version.parse(
version.parse(torch.__version__).base_version
) >= version.parse("1.12")
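
Note that this auto-detection only runs when no explicit override is set; the branch visible at the top of the hunk reads the device from an environment variable. A minimal sketch of the combined resolution order (assuming the variable is named DIFFUSERS_TEST_DEVICE, as in the surrounding file, and guarding torch.xpu for older PyTorch builds):

import os

import torch

# an explicit override from the environment wins over auto-detection
if "DIFFUSERS_TEST_DEVICE" in os.environ:
    torch_device = os.environ["DIFFUSERS_TEST_DEVICE"]
elif torch.cuda.is_available():
    torch_device = "cuda"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    torch_device = "xpu"
else:
    torch_device = "cpu"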
@@ -1067,12 +1072,51 @@ def _is_torch_fp64_available(device):
# Guard these lookups for when Torch is not used - alternative accelerator support is for PyTorch
if is_torch_available():
# Behaviour flags
BACKEND_SUPPORTS_TRAINING = {"cuda": True, "cpu": True, "mps": False, "default": True}
BACKEND_SUPPORTS_TRAINING = {"cuda": True, "xpu": True, "cpu": True, "mps": False, "default": True}
# Function definitions
BACKEND_EMPTY_CACHE = {"cuda": torch.cuda.empty_cache, "cpu": None, "mps": None, "default": None}
BACKEND_DEVICE_COUNT = {"cuda": torch.cuda.device_count, "cpu": lambda: 0, "mps": lambda: 0, "default": 0}
BACKEND_MANUAL_SEED = {"cuda": torch.cuda.manual_seed, "cpu": torch.manual_seed, "default": torch.manual_seed}
BACKEND_EMPTY_CACHE = {
"cuda": torch.cuda.empty_cache,
"xpu": torch.xpu.empty_cache,
"cpu": None,
"mps": torch.mps.empty_cache,
"default": None,
}
BACKEND_DEVICE_COUNT = {
"cuda": torch.cuda.device_count,
"xpu": torch.xpu.device_count,
"cpu": lambda: 0,
"mps": lambda: 0,
"default": 0,
}
BACKEND_MANUAL_SEED = {
"cuda": torch.cuda.manual_seed,
"xpu": torch.xpu.manual_seed,
"cpu": torch.manual_seed,
"mps": torch.mps.manual_seed,
"default": torch.manual_seed,
}
BACKEND_RESET_PEAK_MEMORY_STATS = {
"cuda": torch.cuda.reset_peak_memory_stats,
"xpu": getattr(torch.xpu, "reset_peak_memory_stats", None),
"cpu": None,
"mps": None,
"default": None,
}
BACKEND_RESET_MAX_MEMORY_ALLOCATED = {
"cuda": torch.cuda.reset_max_memory_allocated,
"xpu": None,
"cpu": None,
"mps": None,
"default": None,
}
BACKEND_MAX_MEMORY_ALLOCATED = {
"cuda": torch.cuda.max_memory_allocated,
"xpu": getattr(torch.xpu, "max_memory_allocated", None),
"cpu": 0,
"mps": 0,
"default": 0,
}
# This dispatches a defined function according to the accelerator from the function definitions.
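
The dispatcher named in the comment is not shown in this hunk; a minimal sketch of how such a table-driven dispatch typically works (the name matches the calls below, but the body here is an assumption, not code from this diff):

from typing import Any, Callable, Dict, Union

def _device_agnostic_dispatch(device: str, dispatch_table: Dict[str, Union[Callable, Any]], *args, **kwargs) -> Any:
    # unknown accelerators fall back to the "default" entry
    fn = dispatch_table.get(device, dispatch_table["default"])
    # table entries may be plain values (0, None, True) rather than callables
    if not callable(fn):
        return fn
    return fn(*args, **kwargs)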
@@ -1103,6 +1147,18 @@ def backend_device_count(device: str):
    return _device_agnostic_dispatch(device, BACKEND_DEVICE_COUNT)


def backend_reset_peak_memory_stats(device: str):
    return _device_agnostic_dispatch(device, BACKEND_RESET_PEAK_MEMORY_STATS)


def backend_reset_max_memory_allocated(device: str):
    return _device_agnostic_dispatch(device, BACKEND_RESET_MAX_MEMORY_ALLOCATED)


def backend_max_memory_allocated(device: str):
    return _device_agnostic_dispatch(device, BACKEND_MAX_MEMORY_ALLOCATED)
# These are callables which return boolean behaviour flags and can be used to specify some
# device agnostic alternative where the feature is unsupported.
def backend_supports_training(device: str):
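
A typical consumer of the behaviour flag, sketched as a hypothetical test that skips itself on backends without training support:

import unittest

from diffusers.utils.testing_utils import backend_supports_training, torch_device

class ExampleModelTests(unittest.TestCase):
    def test_training_step(self):
        if not backend_supports_training(torch_device):
            self.skipTest(f"training is not supported on {torch_device}")
        # ... run a forward/backward pass on torch_device ...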
@@ -1159,3 +1215,6 @@ if is_torch_available():
update_mapping_from_spec(BACKEND_EMPTY_CACHE, "EMPTY_CACHE_FN")
update_mapping_from_spec(BACKEND_DEVICE_COUNT, "DEVICE_COUNT_FN")
update_mapping_from_spec(BACKEND_SUPPORTS_TRAINING, "SUPPORTS_TRAINING")
update_mapping_from_spec(BACKEND_RESET_PEAK_MEMORY_STATS, "RESET_PEAK_MEMORY_STATS_FN")
update_mapping_from_spec(BACKEND_RESET_MAX_MEMORY_ALLOCATED, "RESET_MAX_MEMORY_ALLOCATED_FN")
update_mapping_from_spec(BACKEND_MAX_MEMORY_ALLOCATED, "MAX_MEMORY_ALLOCATED_FN")
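
These registrations let an out-of-tree accelerator plug its own functions into the mappings above. A hypothetical spec module is sketched below; the *_FN attribute names follow the registrations in this hunk, while the loading mechanism (an environment variable pointing at the file) is an assumption, not shown in this diff:

# my_device_spec.py -- hypothetical spec for a custom torch backend
import torch

DEVICE_NAME = "privateuseone"

MANUAL_SEED_FN = torch.manual_seed
EMPTY_CACHE_FN = None  # nothing to clear on this backend
DEVICE_COUNT_FN = lambda: 1
SUPPORTS_TRAINING = True
RESET_PEAK_MEMORY_STATS_FN = None
RESET_MAX_MEMORY_ALLOCATED_FN = None
MAX_MEMORY_ALLOCATED_FN = 0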


@@ -57,8 +57,8 @@ from diffusers.utils.testing_utils import (
get_python_version,
is_torch_compile,
require_torch_2,
require_torch_accelerator,
require_torch_accelerator_with_training,
require_torch_gpu,
require_torch_multi_gpu,
run_test_in_subprocess,
torch_all_close,
@@ -543,7 +543,7 @@ class ModelTesterMixin:
assert torch.allclose(output, output_3, atol=self.base_precision)
assert torch.allclose(output_2, output_3, atol=self.base_precision)
@require_torch_gpu
@require_torch_accelerator
def test_set_attn_processor_for_determinism(self):
if self.uses_custom_attn_processor:
return
@@ -1068,7 +1068,7 @@ class ModelTesterMixin:
self.assertTrue(f"Adapter name {wrong_name} not found in the model." in str(err_context.exception))
@require_torch_gpu
@require_torch_accelerator
def test_cpu_offload(self):
config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**config).eval()
@@ -1098,7 +1098,7 @@ class ModelTesterMixin:
self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
@require_torch_gpu
@require_torch_accelerator
def test_disk_offload_without_safetensors(self):
config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**config).eval()
@@ -1132,7 +1132,7 @@ class ModelTesterMixin:
self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
@require_torch_gpu
@require_torch_accelerator
def test_disk_offload_with_safetensors(self):
config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**config).eval()
@@ -1191,7 +1191,7 @@ class ModelTesterMixin:
self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
@require_torch_gpu
@require_torch_accelerator
def test_sharded_checkpoints(self):
torch.manual_seed(0)
config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -1223,7 +1223,7 @@ class ModelTesterMixin:
self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
@require_torch_gpu
@require_torch_accelerator
def test_sharded_checkpoints_with_variant(self):
torch.manual_seed(0)
config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -1261,7 +1261,7 @@ class ModelTesterMixin:
self.assertTrue(torch.allclose(base_output[0], new_output[0], atol=1e-5))
@require_torch_gpu
@require_torch_accelerator
def test_sharded_checkpoints_device_map(self):
config, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**config).eval()
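
The decorator swap above is the heart of this file's changes: tests that only need some accelerator no longer insist on CUDA. A rough sketch of what such a gate looks like (an assumption about its shape, not code from this diff):

import unittest

# assumes the module-level torch_device resolved by testing_utils
def require_torch_accelerator(test_case):
    return unittest.skipUnless(torch_device != "cpu", "test requires an accelerator")(test_case)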


@@ -27,7 +27,7 @@ from diffusers.utils.testing_utils import (
enable_full_determinism,
numpy_cosine_similarity_distance,
require_hf_hub_version_greater,
require_torch_gpu,
require_torch_accelerator,
require_transformers_version_greater,
slow,
torch_device,
@@ -332,7 +332,7 @@ class AllegroPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class AllegroPipelineIntegrationTests(unittest.TestCase):
prompt = "A painting of a squirrel eating a burger."
@@ -350,7 +350,7 @@ class AllegroPipelineIntegrationTests(unittest.TestCase):
generator = torch.Generator("cpu").manual_seed(0)
pipe = AllegroPipeline.from_pretrained("rhymes-ai/Allegro", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
prompt = self.prompt
videos = pipe(
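
enable_model_cpu_offload targets "cuda" by default, so passing device=torch_device is what makes this integration test runnable on other accelerators. A minimal usage sketch (checkpoint and dtype illustrative only):

import torch

from diffusers import DiffusionPipeline
from diffusers.utils.testing_utils import torch_device

pipe = DiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
)
# weights stay on CPU; submodules move to the detected accelerator on demand
pipe.enable_model_cpu_offload(device=torch_device)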


@@ -20,9 +20,10 @@ from diffusers import (
from diffusers.models.attention import FreeNoiseTransformerBlock
from diffusers.utils import is_xformers_available, logging
from diffusers.utils.testing_utils import (
backend_empty_cache,
numpy_cosine_similarity_distance,
require_accelerator,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -547,19 +548,19 @@ class AnimateDiffPipelineFastTests(
@slow
@require_torch_gpu
@require_torch_accelerator
class AnimateDiffPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_animatediff(self):
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
@@ -573,7 +574,7 @@ class AnimateDiffPipelineSlowTests(unittest.TestCase):
clip_sample=False,
)
pipe.enable_vae_slicing()
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
prompt = "night, b&w photo of old house, post apocalypse, forest, storm weather, wind, rocks, 8k uhd, dslr, soft lighting, high quality, film grain"


@@ -24,7 +24,7 @@ from diffusers import AutoencoderKLCogVideoX, CogVideoXPipeline, CogVideoXTransf
from diffusers.utils.testing_utils import (
enable_full_determinism,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -321,7 +321,7 @@ class CogVideoXPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class CogVideoXPipelineIntegrationTests(unittest.TestCase):
prompt = "A painting of a squirrel eating a burger."
@@ -339,7 +339,7 @@ class CogVideoXPipelineIntegrationTests(unittest.TestCase):
generator = torch.Generator("cpu").manual_seed(0)
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
prompt = self.prompt
videos = pipe(


@@ -24,9 +24,10 @@ from transformers import AutoTokenizer, T5EncoderModel
from diffusers import AutoencoderKLCogVideoX, CogVideoXImageToVideoPipeline, CogVideoXTransformer3DModel, DDIMScheduler
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -344,25 +345,25 @@ class CogVideoXImageToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestC
@slow
@require_torch_gpu
@require_torch_accelerator
class CogVideoXImageToVideoPipelineIntegrationTests(unittest.TestCase):
prompt = "A painting of a squirrel eating a burger."
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_cogvideox(self):
generator = torch.Generator("cpu").manual_seed(0)
pipe = CogVideoXImageToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
prompt = self.prompt
image = load_image(


@@ -24,7 +24,7 @@ from diffusers import AutoencoderKL, CogVideoXDDIMScheduler, CogView3PlusPipelin
from diffusers.utils.testing_utils import (
enable_full_determinism,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -232,7 +232,7 @@ class CogView3PlusPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class CogView3PlusPipelineIntegrationTests(unittest.TestCase):
prompt = "A painting of a squirrel eating a burger."
@@ -250,7 +250,7 @@ class CogView3PlusPipelineIntegrationTests(unittest.TestCase):
generator = torch.Generator("cpu").manual_seed(0)
pipe = CogView3PlusPipeline.from_pretrained("THUDM/CogView3Plus-3b", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
prompt = self.prompt
images = pipe(


@@ -34,13 +34,17 @@ from diffusers import (
from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
get_python_version,
is_torch_compile,
load_image,
load_numpy,
require_torch_2,
require_torch_gpu,
require_torch_accelerator,
run_test_in_subprocess,
slow,
torch_device,
@@ -703,17 +707,17 @@ class StableDiffusionMultiControlNetOneModelPipelineFastTests(
@slow
@require_torch_gpu
@require_torch_accelerator
class ControlNetPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_canny(self):
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
@@ -721,7 +725,7 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -748,7 +752,7 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -775,7 +779,7 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -802,7 +806,7 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -829,7 +833,7 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -856,7 +860,7 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -883,7 +887,7 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(5)
@@ -910,7 +914,7 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(5)
@@ -932,9 +936,9 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
assert np.abs(expected_image - image).max() < 8e-2
def test_sequential_cpu_offloading(self):
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg")
@@ -943,7 +947,7 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
prompt = "house"
image = load_image(
@@ -957,7 +961,7 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
output_type="np",
)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 4 GB is allocated
assert mem_bytes < 4 * 10**9
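
End to end, the backend-agnostic memory measurement looks like the sketch below (pipeline call elided; helper names as imported at the top of this file):

from diffusers.utils.testing_utils import (
    backend_empty_cache,
    backend_max_memory_allocated,
    backend_reset_max_memory_allocated,
    backend_reset_peak_memory_stats,
    torch_device,
)

# reset counters so the peak reflects only the run below
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)

# ... run the offloaded pipeline ...

mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 4 * 10**9  # same 4 GB budget as the assertion above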
@@ -967,7 +971,7 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -1000,7 +1004,7 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -1041,7 +1045,7 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -1068,17 +1072,17 @@ class ControlNetPipelineSlowTests(unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionMultiControlNetPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_pose_and_canny(self):
controlnet_canny = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
@@ -1089,7 +1093,7 @@ class StableDiffusionMultiControlNetPipelineSlowTests(unittest.TestCase):
safety_checker=None,
controlnet=[controlnet_pose, controlnet_canny],
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)


@@ -39,7 +39,7 @@ from diffusers.utils.testing_utils import (
enable_full_determinism,
floats_tensor,
load_numpy,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -393,7 +393,7 @@ class StableDiffusionMultiControlNetPipelineFastTests(
@slow
@require_torch_gpu
@require_torch_accelerator
class ControlNetImg2ImgPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
@@ -411,7 +411,7 @@ class ControlNetImg2ImgPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)


@@ -40,7 +40,7 @@ from diffusers.utils.testing_utils import (
floats_tensor,
load_numpy,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -445,7 +445,7 @@ class MultiControlNetInpaintPipelineFastTests(
@slow
@require_torch_gpu
@require_torch_accelerator
class ControlNetInpaintPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
@@ -463,7 +463,7 @@ class ControlNetInpaintPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
"botp/stable-diffusion-v1-5-inpainting", safety_checker=None, controlnet=controlnet
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -509,7 +509,7 @@ class ControlNetInpaintPipelineSlowTests(unittest.TestCase):
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(33)


@@ -35,9 +35,10 @@ from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2D
from diffusers.pipelines.controlnet.pipeline_controlnet import MultiControlNetModel
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
load_image,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -212,7 +213,7 @@ class StableDiffusionXLControlNetPipelineFastTests(
def test_save_load_optional_components(self):
self._test_save_load_optional_components()
@require_torch_gpu
@require_torch_accelerator
def test_stable_diffusion_xl_offloads(self):
pipes = []
components = self.get_dummy_components()
@@ -893,17 +894,17 @@ class StableDiffusionXLMultiControlNetOneModelPipelineFastTests(
@slow
@require_torch_gpu
@require_torch_accelerator
class ControlNetSDXLPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_canny(self):
controlnet = ControlNetModel.from_pretrained("diffusers/controlnet-canny-sdxl-1.0")
@@ -911,7 +912,7 @@ class ControlNetSDXLPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet
)
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -934,7 +935,7 @@ class ControlNetSDXLPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet
)
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)


@@ -28,7 +28,12 @@ from diffusers import (
UNet2DConditionModel,
)
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, torch_device
from diffusers.utils.testing_utils import (
enable_full_determinism,
floats_tensor,
require_torch_accelerator,
torch_device,
)
from ..pipeline_params import (
IMAGE_TO_IMAGE_IMAGE_PARAMS,
@@ -241,7 +246,7 @@ class ControlNetPipelineSDXLImg2ImgFastTests(
def test_save_load_optional_components(self):
pass
@require_torch_gpu
@require_torch_accelerator
def test_stable_diffusion_xl_offloads(self):
pipes = []
components = self.get_dummy_components()
@@ -250,12 +255,12 @@ class ControlNetPipelineSDXLImg2ImgFastTests(
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []


@@ -29,8 +29,9 @@ from diffusers import (
from diffusers.models import HunyuanDiT2DControlNetModel, HunyuanDiT2DMultiControlNetModel
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -178,19 +179,19 @@ class HunyuanDiTControlNetPipelineFastTests(unittest.TestCase, PipelineTesterMix
@slow
@require_torch_gpu
@require_torch_accelerator
class HunyuanDiTControlNetPipelineSlowTests(unittest.TestCase):
pipeline_class = HunyuanDiTControlNetPipeline
def setUp(self):
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_canny(self):
controlnet = HunyuanDiT2DControlNetModel.from_pretrained(
@@ -199,7 +200,7 @@ class HunyuanDiTControlNetPipelineSlowTests(unittest.TestCase):
pipe = HunyuanDiTControlNetPipeline.from_pretrained(
"Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -238,7 +239,7 @@ class HunyuanDiTControlNetPipelineSlowTests(unittest.TestCase):
pipe = HunyuanDiTControlNetPipeline.from_pretrained(
"Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -277,7 +278,7 @@ class HunyuanDiTControlNetPipelineSlowTests(unittest.TestCase):
pipe = HunyuanDiTControlNetPipeline.from_pretrained(
"Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -318,7 +319,7 @@ class HunyuanDiTControlNetPipelineSlowTests(unittest.TestCase):
pipe = HunyuanDiTControlNetPipeline.from_pretrained(
"Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)


@@ -34,13 +34,14 @@ from diffusers import (
)
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
is_torch_compile,
load_image,
load_numpy,
require_accelerator,
require_torch_2,
require_torch_gpu,
require_torch_accelerator,
run_test_in_subprocess,
slow,
torch_device,
@@ -92,7 +93,7 @@ def _test_stable_diffusion_compile(in_queue, out_queue, timeout):
safety_checker=None,
torch_dtype=torch.float16,
)
pipe.to("cuda")
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.unet.to(memory_format=torch.channels_last)
@@ -334,12 +335,12 @@ class ControlNetXSPipelineFastTests(
@slow
@require_torch_gpu
@require_torch_accelerator
class ControlNetXSPipelineSlowTests(unittest.TestCase):
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_canny(self):
controlnet = ControlNetXSAdapter.from_pretrained(
@@ -348,7 +349,7 @@ class ControlNetXSPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -374,7 +375,7 @@ class ControlNetXSPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)


@@ -31,7 +31,14 @@ from diffusers import (
UNet2DConditionModel,
)
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, slow, torch_device
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
load_image,
require_torch_accelerator,
slow,
torch_device,
)
from diffusers.utils.torch_utils import randn_tensor
from ...models.autoencoders.vae import (
@@ -192,7 +199,7 @@ class StableDiffusionXLControlNetXSPipelineFastTests(
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(expected_max_diff=2e-3)
@require_torch_gpu
@require_torch_accelerator
# Copied from test_controlnet_sdxl.py
def test_stable_diffusion_xl_offloads(self):
pipes = []
@@ -202,12 +209,12 @@ class StableDiffusionXLControlNetXSPipelineFastTests(
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload()
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload()
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
@@ -369,12 +376,12 @@ class StableDiffusionXLControlNetXSPipelineFastTests(
@slow
@require_torch_gpu
@require_torch_accelerator
class StableDiffusionXLControlNetXSPipelineSlowTests(unittest.TestCase):
def tearDown(self):
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_canny(self):
controlnet = ControlNetXSAdapter.from_pretrained(
@@ -383,7 +390,7 @@ class StableDiffusionXLControlNetXSPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -407,7 +414,7 @@ class StableDiffusionXLControlNetXSPipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_sequential_cpu_offload()
pipe.enable_sequential_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)


@@ -19,7 +19,7 @@ import numpy as np
import torch
from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, slow, torch_device
from ..pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
@@ -99,7 +99,7 @@ class DDIMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class DDIMPipelineIntegrationTests(unittest.TestCase):
def test_inference_cifar10(self):
model_id = "google/ddpm-cifar10-32"


@@ -19,7 +19,7 @@ import numpy as np
import torch
from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, slow, torch_device
enable_full_determinism()
@@ -88,7 +88,7 @@ class DDPMPipelineFastTests(unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class DDPMPipelineIntegrationTests(unittest.TestCase):
def test_inference_cifar10(self):
model_id = "google/ddpm-cifar10-32"


@@ -24,10 +24,13 @@ from diffusers import (
from diffusers.models.attention_processor import AttnAddedKVProcessor
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
load_numpy,
require_accelerator,
require_hf_hub_version_greater,
require_torch_gpu,
require_torch_accelerator,
require_transformers_version_greater,
skip_mps,
slow,
@@ -98,28 +101,28 @@ class IFPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.T
@slow
@require_torch_gpu
@require_torch_accelerator
class IFPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_if_text_to_image(self):
pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.unet.set_attn_processor(AttnAddedKVProcessor())
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
torch.cuda.reset_max_memory_allocated()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
backend_reset_max_memory_allocated(torch_device)
backend_empty_cache(torch_device)
backend_reset_peak_memory_stats(torch_device)
generator = torch.Generator(device="cpu").manual_seed(0)
output = pipe(


@@ -23,11 +23,14 @@ from diffusers import IFImg2ImgPipeline
from diffusers.models.attention_processor import AttnAddedKVProcessor
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
floats_tensor,
load_numpy,
require_accelerator,
require_hf_hub_version_greater,
require_torch_gpu,
require_torch_accelerator,
require_transformers_version_greater,
skip_mps,
slow,
@@ -109,19 +112,19 @@ class IFImg2ImgPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, uni
@slow
@require_torch_gpu
@require_torch_accelerator
class IFImg2ImgPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_if_img2img(self):
pipe = IFImg2ImgPipeline.from_pretrained(
@@ -130,11 +133,11 @@ class IFImg2ImgPipelineSlowTests(unittest.TestCase):
torch_dtype=torch.float16,
)
pipe.unet.set_attn_processor(AttnAddedKVProcessor())
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
torch.cuda.reset_max_memory_allocated()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
backend_reset_max_memory_allocated(torch_device)
backend_empty_cache(torch_device)
backend_reset_peak_memory_stats(torch_device)
image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device)
generator = torch.Generator(device="cpu").manual_seed(0)


@@ -23,11 +23,15 @@ from diffusers import IFImg2ImgSuperResolutionPipeline
from diffusers.models.attention_processor import AttnAddedKVProcessor
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
floats_tensor,
load_numpy,
require_accelerator,
require_hf_hub_version_greater,
require_torch_gpu,
require_torch_accelerator,
require_transformers_version_greater,
skip_mps,
slow,
@@ -106,19 +110,19 @@ class IFImg2ImgSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineT
@slow
@require_torch_gpu
@require_torch_accelerator
class IFImg2ImgSuperResolutionPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_if_img2img_superresolution(self):
pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained(
@@ -127,11 +131,11 @@ class IFImg2ImgSuperResolutionPipelineSlowTests(unittest.TestCase):
torch_dtype=torch.float16,
)
pipe.unet.set_attn_processor(AttnAddedKVProcessor())
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
torch.cuda.reset_max_memory_allocated()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
backend_reset_max_memory_allocated(torch_device)
backend_empty_cache(torch_device)
backend_reset_peak_memory_stats(torch_device)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -151,7 +155,8 @@ class IFImg2ImgSuperResolutionPipelineSlowTests(unittest.TestCase):
assert image.shape == (256, 256, 3)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 12 * 10**9
expected_image = load_numpy(


@@ -23,11 +23,15 @@ from diffusers import IFInpaintingPipeline
from diffusers.models.attention_processor import AttnAddedKVProcessor
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
floats_tensor,
load_numpy,
require_accelerator,
require_hf_hub_version_greater,
require_torch_gpu,
require_torch_accelerator,
require_transformers_version_greater,
skip_mps,
slow,
@@ -106,30 +110,30 @@ class IFInpaintingPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin,
@slow
@require_torch_gpu
@require_torch_accelerator
class IFInpaintingPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_if_inpainting(self):
pipe = IFInpaintingPipeline.from_pretrained(
"DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16
)
pipe.unet.set_attn_processor(AttnAddedKVProcessor())
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device)
mask_image = floats_tensor((1, 3, 64, 64), rng=random.Random(1)).to(torch_device)
@@ -145,7 +149,7 @@ class IFInpaintingPipelineSlowTests(unittest.TestCase):
)
image = output.images[0]
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 12 * 10**9
expected_image = load_numpy(


@@ -23,11 +23,15 @@ from diffusers import IFInpaintingSuperResolutionPipeline
from diffusers.models.attention_processor import AttnAddedKVProcessor
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
floats_tensor,
load_numpy,
require_accelerator,
require_hf_hub_version_greater,
require_torch_gpu,
require_torch_accelerator,
require_transformers_version_greater,
skip_mps,
slow,
@@ -108,31 +112,31 @@ class IFInpaintingSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipeli
@slow
@require_torch_gpu
@require_torch_accelerator
class IFInpaintingSuperResolutionPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_if_inpainting_superresolution(self):
pipe = IFInpaintingSuperResolutionPipeline.from_pretrained(
"DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16
)
pipe.unet.set_attn_processor(AttnAddedKVProcessor())
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
# Super resolution test
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -154,7 +158,7 @@ class IFInpaintingSuperResolutionPipelineSlowTests(unittest.TestCase):
assert image.shape == (256, 256, 3)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 12 * 10**9
expected_image = load_numpy(


@@ -23,11 +23,15 @@ from diffusers import IFSuperResolutionPipeline
from diffusers.models.attention_processor import AttnAddedKVProcessor
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
floats_tensor,
load_numpy,
require_accelerator,
require_hf_hub_version_greater,
require_torch_gpu,
require_torch_accelerator,
require_transformers_version_greater,
skip_mps,
slow,
@@ -101,31 +105,31 @@ class IFSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMi
@slow
@require_torch_gpu
@require_torch_accelerator
class IFSuperResolutionPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_if_superresolution(self):
pipe = IFSuperResolutionPipeline.from_pretrained(
"DeepFloyd/IF-II-L-v1.0", variant="fp16", torch_dtype=torch.float16
)
pipe.unet.set_attn_processor(AttnAddedKVProcessor())
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
# Super resolution test
torch.cuda.empty_cache()
torch.cuda.reset_max_memory_allocated()
torch.cuda.reset_peak_memory_stats()
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
image = floats_tensor((1, 3, 64, 64), rng=random.Random(0)).to(torch_device)
generator = torch.Generator(device="cpu").manual_seed(0)
@@ -141,7 +145,7 @@ class IFSuperResolutionPipelineSlowTests(unittest.TestCase):
assert image.shape == (256, 256, 3)
mem_bytes = torch.cuda.max_memory_allocated()
mem_bytes = backend_max_memory_allocated(torch_device)
assert mem_bytes < 12 * 10**9
expected_image = load_numpy(


@@ -30,7 +30,7 @@ from diffusers import (
from diffusers.utils.testing_utils import (
enable_full_determinism,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
slow,
torch_device,
)
@@ -299,7 +299,7 @@ class HunyuanDiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class HunyuanDiTPipelineIntegrationTests(unittest.TestCase):
prompt = "一个宇航员在骑马"
@@ -319,7 +319,7 @@ class HunyuanDiTPipelineIntegrationTests(unittest.TestCase):
pipe = HunyuanDiTPipeline.from_pretrained(
"XCLiu/HunyuanDiT-0523", revision="refs/pr/2", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
prompt = self.prompt
image = pipe(


@@ -36,10 +36,11 @@ from diffusers import (
from diffusers.models.unets import I2VGenXLUNet
from diffusers.utils import is_xformers_available, load_image
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
numpy_cosine_similarity_distance,
require_torch_gpu,
require_torch_accelerator,
skip_mps,
slow,
torch_device,
@@ -228,23 +229,23 @@ class I2VGenXLPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unit
@slow
@require_torch_gpu
@require_torch_accelerator
class I2VGenXLPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_i2vgen_xl(self):
pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"


@@ -66,6 +66,7 @@ from diffusers.utils import (
)
from diffusers.utils.testing_utils import (
CaptureLogger,
backend_empty_cache,
enable_full_determinism,
floats_tensor,
get_python_version,
@@ -78,7 +79,7 @@ from diffusers.utils.testing_utils import (
require_hf_hub_version_greater,
require_onnxruntime,
require_torch_2,
require_torch_gpu,
require_torch_accelerator,
require_transformers_version_greater,
run_test_in_subprocess,
slow,
@@ -1150,7 +1151,7 @@ class CustomPipelineTests(unittest.TestCase):
assert conf_1 == conf_2
@slow
@require_torch_gpu
@require_torch_accelerator
def test_download_from_git(self):
# Because adaptive_avg_pool2d_backward_cuda
# does not have a deterministic implementation.
@@ -1364,7 +1365,7 @@ class PipelineFastTests(unittest.TestCase):
assert image_img2img.shape == (1, 32, 32, 3)
assert image_text2img.shape == (1, 64, 64, 3)
@require_torch_gpu
@require_torch_accelerator
def test_pipe_false_offload_warn(self):
unet = self.dummy_cond_unet()
scheduler = PNDMScheduler(skip_prk_steps=True)
@@ -1898,19 +1899,19 @@ class PipelineFastTests(unittest.TestCase):
@slow
@require_torch_gpu
@require_torch_accelerator
class PipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_smart_download(self):
model_id = "hf-internal-testing/unet-pipeline-dummy"
@@ -2102,7 +2103,7 @@ class PipelineSlowTests(unittest.TestCase):
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()
pipe.enable_model_cpu_offload(device=torch_device)
pipe.enable_attention_slicing()
compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
@@ -2129,19 +2130,19 @@ class PipelineSlowTests(unittest.TestCase):
@nightly
@require_torch_gpu
@require_torch_accelerator
class PipelineNightlyTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
torch.cuda.empty_cache()
backend_empty_cache(torch_device)
def test_ddpm_ddim_equality_batched(self):
seed = 0