From 91545666e05384f4d6161d90fa9c306bc70f937e Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Wed, 11 Jun 2025 22:41:59 +0530
Subject: [PATCH] [tests] model-level `device_map` clarifications (#11681)

* add clarity in documentation for device_map
* docs
* fix how compiler tester mixins are used.
* propagate
* more
* typo.
* fix tests
* fix order of decorators.
* clarify more.
* more test cases.
* fix doc
* fix device_map docstring in pipeline_utils.
* more examples
* more
* update
* remove code for stuff that is already supported.
* fix stuff.
---
 src/diffusers/models/modeling_utils.py        | 35 ++++++++++++++++--
 src/diffusers/pipelines/pipeline_utils.py     | 13 +++----
 .../unets/test_models_unet_2d_condition.py    | 37 +++++++++++++++++++
 3 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py
index 55ce0cf79f..1e9e28471d 100644
--- a/src/diffusers/models/modeling_utils.py
+++ b/src/diffusers/models/modeling_utils.py
@@ -814,14 +814,43 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
                 Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+            device_map (`Union[int, str, torch.device]` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
                 A map that specifies where each submodule should go. It doesn't need to be defined for each
                 parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
                 same device. Defaults to `None`, meaning that the model will be loaded on CPU.
+
+                Examples:
+
+                ```py
+                >>> from diffusers import AutoModel
+                >>> import torch
+
+                >>> # This works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="cuda"
+                ... )
+                >>> # This also works (integer accelerator device ID).
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map=0
+                ... )
+                >>> # Specifying a supported offloading strategy like "auto" also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="auto"
+                ... )
+                >>> # Specifying a dictionary as `device_map` also works.
+                >>> model = AutoModel.from_pretrained(
+                ...     "stabilityai/stable-diffusion-xl-base-1.0",
+                ...     subfolder="unet",
+                ...     device_map={"": torch.device("cuda")},
+                ... )
+                ```

                 Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
                 more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+                map](https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference#the-devicemap). You
+                can also refer to the [Diffusers-specific
+                documentation](https://huggingface.co/docs/diffusers/main/en/training/distributed_inference#model-sharding)
+                for more concrete examples.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for each
                 GPU and the available CPU RAM if unset.
@@ -1387,7 +1416,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
         low_cpu_mem_usage: bool = True,
         dtype: Optional[Union[str, torch.dtype]] = None,
         keep_in_fp32_modules: Optional[List[str]] = None,
-        device_map: Dict[str, Union[int, str, torch.device]] = None,
+        device_map: Union[str, int, torch.device, Dict[str, Union[int, str, torch.device]]] = None,
         offload_state_dict: Optional[bool] = None,
         offload_folder: Optional[Union[str, os.PathLike]] = None,
         dduf_entries: Optional[Dict[str, DDUFEntry]] = None,
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index 0ac4251ec6..efeb085a72 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -669,14 +669,11 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                 Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.
-            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
-                A map that specifies where each submodule should go. It doesn’t need to be defined for each
-                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
-                same device.
-
-                Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For
-                more information about each option see [designing a device
-                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            device_map (`str`, *optional*):
+                Strategy that dictates how the different components of a pipeline should be placed on available
+                devices. Currently, only the "balanced" `device_map` strategy is supported. Check out
+                [this guide](https://huggingface.co/docs/diffusers/main/en/tutorials/inference_with_big_models#device-placement)
+                to learn more.
             max_memory (`Dict`, *optional*):
                 A dictionary device identifier for the maximum memory. Will default to the maximum memory available for each
                 GPU and the available CPU RAM if unset.
diff --git a/tests/models/unets/test_models_unet_2d_condition.py b/tests/models/unets/test_models_unet_2d_condition.py
index c8ed68c65b..e0331d15dd 100644
--- a/tests/models/unets/test_models_unet_2d_condition.py
+++ b/tests/models/unets/test_models_unet_2d_condition.py
@@ -46,6 +46,7 @@ from diffusers.utils.testing_utils import (
     require_peft_backend,
     require_torch_accelerator,
     require_torch_accelerator_with_fp16,
+    require_torch_gpu,
     skip_mps,
     slow,
     torch_all_close,
@@ -1083,6 +1084,42 @@ class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.Test
         assert loaded_model
         assert new_output.sample.shape == (4, 4, 16, 16)

+    @parameterized.expand(
+        [
+            (-1, "You can't pass device_map as a negative int"),
+            ("foo", "When passing device_map as a string, the value needs to be a device name"),
+        ]
+    )
+    def test_wrong_device_map_raises_error(self, device_map, msg_substring):
+        with self.assertRaises(ValueError) as err_ctx:
+            _ = self.model_class.from_pretrained(
+                "hf-internal-testing/unet2d-sharded-dummy-subfolder", subfolder="unet", device_map=device_map
+            )
+
+        assert msg_substring in str(err_ctx.exception)
+
+    @parameterized.expand([0, "cuda", torch.device("cuda"), torch.device("cuda:0")])
+    @require_torch_gpu
+    def test_passing_non_dict_device_map_works(self, device_map):
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        loaded_model = self.model_class.from_pretrained(
+            "hf-internal-testing/unet2d-sharded-dummy-subfolder", subfolder="unet", device_map=device_map
+        )
+        output = loaded_model(**inputs_dict)
+        assert output.sample.shape == (4, 4, 16, 16)
+
+    @parameterized.expand([("", "cuda"), ("", torch.device("cuda"))])
+    @require_torch_gpu
+    def test_passing_dict_device_map_works(self, name, device_map):
+        # There are other valid dict-based `device_map` values too. It's best to refer to
+        # the docs for those: https://huggingface.co/docs/accelerate/en/concept_guides/big_model_inference#the-devicemap.
+        _, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+        loaded_model = self.model_class.from_pretrained(
+            "hf-internal-testing/unet2d-sharded-dummy-subfolder", subfolder="unet", device_map={name: device_map}
+        )
+        output = loaded_model(**inputs_dict)
+        assert output.sample.shape == (4, 4, 16, 16)
+
     @require_peft_backend
     def test_load_attn_procs_raise_warning(self):
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
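For reference, a minimal end-to-end sketch of the usage this patch documents (not part of the diff; it assumes a CUDA-capable machine with `torch`, `diffusers`, and 🤗 Accelerate installed, and reuses the illustrative SDXL checkpoint from the docstring examples):

```py
import torch

from diffusers import AutoModel, DiffusionPipeline

# Model-level placement: a device string, an integer device ID, a torch.device,
# or an explicit dict are all accepted per the clarified docstring above.
unet = AutoModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", device_map="cuda"
)

# Equivalent dict form; the "" key applies the device to the whole module tree.
unet = AutoModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    subfolder="unet",
    device_map={"": torch.device("cuda")},
)

# Pipeline-level placement: only the "balanced" strategy is supported, which
# spreads whole components (UNet, text encoders, VAE) across available GPUs.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    device_map="balanced",
)
print(pipe.hf_device_map)  # reports the resulting component-to-device assignment
```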