From 91545666e05384f4d6161d90fa9c306bc70f937e Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Wed, 11 Jun 2025 22:41:59 +0530
Subject: [PATCH] [tests] model-level `device_map` clarifications (#11681)

* add clarity in documentation for device_map
* docs
* fix how compiler tester mixins are used.
* propagate
* more
* typo.
* fix tests
* fix order of decorators.
* clarify more.
* more test cases.
* fix doc
* fix device_map docstring in pipeline_utils.
* more examples
* more
* update
* remove code for stuff that is already supported.
* fix stuff.
---
 src/diffusers/models/modeling_utils.py        | 35 ++++++++++++++++--
 src/diffusers/pipelines/pipeline_utils.py     | 13 +++----
 .../unets/test_models_unet_2d_condition.py    | 37 +++++++++++++++++++
 3 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index 55ce0cf79f..1e9e28471d 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -814,14 +814,43 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
                 Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+            device_map (`Union[int, str, torch.device]` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
                 A map that specifies where each submodule should go. It doesn't need to be defined for each
                 parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
                 same device. Defaults to `None`, meaning that the model will be loaded on CPU.
+
+                Examples:
+
+                ```py
+                >>> from diffusers import AutoModel
+                >>> import torch
+
+                >>> # This works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="cuda"
+                ... )
+                >>> # This also works (integer accelerator device ID).
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map=0
+                ... )
+                >>> # Specifying a supported offloading strategy like "auto" also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="auto"
+                ... )
+                >>> # Specifying a dictionary as `device_map` also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0",
+                ...     subfolder="unet",
+                ...     device_map={"": torch.device("cuda")},
+                ... )
+                ```

                 Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
                 more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+                map](https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference#the-devicemap). You
+                can also refer to the [Diffusers-specific
+                documentation](https://huggingface.co/docs/diffusers/main/en/training/distributed_inference#model-sharding)
+                for more concrete examples.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for each
                 GPU and the available CPU RAM if unset.
@@ -1387,7 +1416,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
         low_cpu_mem_usage: bool = True,
         dtype: Optional[Union[str, torch.dtype]] = None,
         keep_in_fp32_modules: Optional[List[str]] = None,
-        device_map: Dict[str, Union[int, str, torch.device]] = None,
+        device_map: Union[str, int, torch.device, Dict[str, Union[int, str, torch.device]]] = None,
         offload_state_dict: Optional[bool] = None,
         offload_folder: Optional[Union[str, os.PathLike]] = None,
         dduf_entries: Optional[Dict[str, DDUFEntry]] = None,
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 0ac4251ec6..efeb085a72 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -669,14 +669,11 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                 Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
-                A map that specifies where each submodule should go. It doesn’t need to be defined for each
-                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
-                same device.
-
-                Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
-                more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            device_map (`str`, *optional*):
+                Strategy that dictates how the different components of a pipeline should be placed on available
+                devices. Currently, only the "balanced" `device_map` strategy is supported. Check out
+                [this guide](https://huggingface.co/docs/diffusers/main/en/tutorials/inference_with_big_models#device-placement)
+                to learn more.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for each
                 GPU and the available CPU RAM if unset.
diff --git a/tests/models/unets/test_models_unet_2d_condition.py b/tests/models/unets/test_models_unet_2d_condition.py
index c8ed68c65b..e0331d15dd 100644
--- a/tests/models/unets/test_models_unet_2d_condition.py
+++ b/tests/models/unets/test_models_unet_2d_condition.py
@@ -46,6 +46,7 @@ from diffusers.utils.testing_utils import (
     require_peft_backend,
     require_torch_accelerator,
     require_torch_accelerator_with_fp16,
+    require_torch_gpu,
     skip_mps,
     slow,
     torch_all_close,
@@ -1083,6 +1084,42 @@ class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.Test
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)

+    @parameterized.expand(
+        [
+            (-1, "You can't pass device_map as a negative int"),
+            ("foo", "When passing device_map as a string, the value needs to be a device name"),
+        ]
+    )
+    def test_wrong_device_map_raises_error(self, device_map, msg_substring):
+        with self.assertRaises(ValueError) as err_ctx:
+            _ = self.model_class.from_pretrained(
+                "hf-internal-testing/unet2d-sharded-dummy-subfolder", subfolder="unet", device_map=device_map
+            )
+
+        assert msg_substring in str(err_ctx.exception)
+
+    @parameterized.expand([0, "cuda", torch.device("cuda"), torch.device("cuda:0")])
+    @require_torch_gpu
+    def test_passing_non_dict_device_map_works(self, device_map):
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        loaded_model = self.model_class.from_pretrained(
+            "hf-internal-testing/unet2d-sharded-dummy-subfolder", subfolder="unet", device_map=device_map
+        )
+        output = loaded_model(**inputs_dict)
+        assert output.sample.shape == (4, 4, 16, 16)
+
+    @parameterized.expand([("", "cuda"), ("", torch.device("cuda"))])
+    @require_torch_gpu
+    def test_passing_dict_device_map_works(self, name, device_map):
+        # There are other valid dict-based `device_map` values too. It's best to refer to
+        # the docs for those: https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference#the-devicemap.
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        loaded_model = self.model_class.from_pretrained(
+            "hf-internal-testing/unet2d-sharded-dummy-subfolder", subfolder="unet", device_map={name: device_map}
+        )
+        output = loaded_model(**inputs_dict)
+        assert output.sample.shape == (4, 4, 16, 16)
+
     @require_peft_backend
     def test_load_attn_procs_raise_warning(self):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
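For reference, a minimal end-to-end sketch of the usage this patch documents (not part of the diff; it assumes a CUDA-capable machine with `torch`, `diffusers`, and 🤗 Accelerate installed, and reuses the illustrative SDXL checkpoint from the docstring examples):

```py
import torch

from diffusers import AutoModel, DiffusionPipeline

# Model-level placement: a device string, an integer device ID, a torch.device,
# or an explicit dict are all accepted per the clarified docstring above.
unet = AutoModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="cuda"
)

# Equivalent dict form; the "" key applies the device to the whole module tree.
unet = AutoModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    subfolder="unet",
    device_map={"": torch.device("cuda")},
)

# Pipeline-level placement: only the "balanced" strategy is supported, which
# spreads whole components (UNet, text encoders, VAE) across available GPUs.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    device_map="balanced",
)
print(pipe.hf_device_map)  # reports the resulting component-to-device assignment
```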