From d8310a8fca812ff4afdcc1fd09c135dcccaab670 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 22 Sep 2025 15:14:39 +0530 Subject: [PATCH 01/18] [lora] factor out the overlaps in `save_lora_weights()`. (#12027) * factor out the overlaps in save_lora_weights(). * remove comment. * remove comment. * up * fix-copies --- src/diffusers/loaders/lora_base.py | 35 ++ src/diffusers/loaders/lora_pipeline.py | 426 ++++++++++--------------- 2 files changed, 205 insertions(+), 256 deletions(-) diff --git a/src/diffusers/loaders/lora_base.py b/src/diffusers/loaders/lora_base.py index d18c82df4f..0ee32f820b 100644 --- a/src/diffusers/loaders/lora_base.py +++ b/src/diffusers/loaders/lora_base.py @@ -1064,6 +1064,41 @@ class LoraBaseMixin: save_function(state_dict, save_path) logger.info(f"Model weights saved in {save_path}") + @classmethod + def _save_lora_weights( + cls, + save_directory: Union[str, os.PathLike], + lora_layers: Dict[str, Dict[str, Union[torch.nn.Module, torch.Tensor]]], + lora_metadata: Dict[str, Optional[dict]], + is_main_process: bool = True, + weight_name: str = None, + save_function: Callable = None, + safe_serialization: bool = True, + ): + """ + Helper method to pack and save LoRA weights and metadata. This method centralizes the saving logic for all + pipeline types. + """ + state_dict = {} + final_lora_adapter_metadata = {} + + for prefix, layers in lora_layers.items(): + state_dict.update(cls.pack_weights(layers, prefix)) + + for prefix, metadata in lora_metadata.items(): + if metadata: + final_lora_adapter_metadata.update(_pack_dict_with_prefix(metadata, prefix)) + + cls.write_lora_layers( + state_dict=state_dict, + save_directory=save_directory, + is_main_process=is_main_process, + weight_name=weight_name, + save_function=save_function, + safe_serialization=safe_serialization, + lora_adapter_metadata=final_lora_adapter_metadata if final_lora_adapter_metadata else None, + ) + @classmethod def _optionally_disable_offloading(cls, _pipeline): return _func_optionally_disable_offloading(_pipeline=_pipeline) diff --git a/src/diffusers/loaders/lora_pipeline.py b/src/diffusers/loaders/lora_pipeline.py index 7e89066f1f..8060b519f1 100644 --- a/src/diffusers/loaders/lora_pipeline.py +++ b/src/diffusers/loaders/lora_pipeline.py @@ -510,35 +510,28 @@ class StableDiffusionLoraLoaderMixin(LoraBaseMixin): text_encoder_lora_adapter_metadata: LoRA adapter metadata associated with the text encoder to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} - - if not (unet_lora_layers or text_encoder_lora_layers): - raise ValueError("You must pass at least one of `unet_lora_layers` and `text_encoder_lora_layers`.") + lora_layers = {} + lora_metadata = {} if unet_lora_layers: - state_dict.update(cls.pack_weights(unet_lora_layers, cls.unet_name)) + lora_layers[cls.unet_name] = unet_lora_layers + lora_metadata[cls.unet_name] = unet_lora_adapter_metadata if text_encoder_lora_layers: - state_dict.update(cls.pack_weights(text_encoder_lora_layers, cls.text_encoder_name)) + lora_layers[cls.text_encoder_name] = text_encoder_lora_layers + lora_metadata[cls.text_encoder_name] = text_encoder_lora_adapter_metadata - if unet_lora_adapter_metadata: - lora_adapter_metadata.update(_pack_dict_with_prefix(unet_lora_adapter_metadata, cls.unet_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `unet_lora_layers` or `text_encoder_lora_layers`.") - if text_encoder_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(text_encoder_lora_adapter_metadata, cls.text_encoder_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) def fuse_lora( @@ -1004,44 +997,34 @@ class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin): text_encoder_2_lora_adapter_metadata: LoRA adapter metadata associated with the second text encoder to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} - - if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): - raise ValueError( - "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers`, `text_encoder_2_lora_layers`." - ) + lora_layers = {} + lora_metadata = {} if unet_lora_layers: - state_dict.update(cls.pack_weights(unet_lora_layers, cls.unet_name)) + lora_layers[cls.unet_name] = unet_lora_layers + lora_metadata[cls.unet_name] = unet_lora_adapter_metadata if text_encoder_lora_layers: - state_dict.update(cls.pack_weights(text_encoder_lora_layers, "text_encoder")) + lora_layers["text_encoder"] = text_encoder_lora_layers + lora_metadata["text_encoder"] = text_encoder_lora_adapter_metadata if text_encoder_2_lora_layers: - state_dict.update(cls.pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) + lora_layers["text_encoder_2"] = text_encoder_2_lora_layers + lora_metadata["text_encoder_2"] = text_encoder_2_lora_adapter_metadata - if unet_lora_adapter_metadata is not None: - lora_adapter_metadata.update(_pack_dict_with_prefix(unet_lora_adapter_metadata, cls.unet_name)) - - if text_encoder_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(text_encoder_lora_adapter_metadata, cls.text_encoder_name) + if not lora_layers: + raise ValueError( + "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers`, or `text_encoder_2_lora_layers`." ) - if text_encoder_2_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(text_encoder_2_lora_adapter_metadata, "text_encoder_2") - ) - - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) def fuse_lora( @@ -1467,46 +1450,34 @@ class SD3LoraLoaderMixin(LoraBaseMixin): text_encoder_2_lora_adapter_metadata: LoRA adapter metadata associated with the second text encoder to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} - - if not (transformer_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): - raise ValueError( - "You must pass at least one of `transformer_lora_layers`, `text_encoder_lora_layers`, `text_encoder_2_lora_layers`." - ) + lora_layers = {} + lora_metadata = {} if transformer_lora_layers: - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata if text_encoder_lora_layers: - state_dict.update(cls.pack_weights(text_encoder_lora_layers, "text_encoder")) + lora_layers["text_encoder"] = text_encoder_lora_layers + lora_metadata["text_encoder"] = text_encoder_lora_adapter_metadata if text_encoder_2_lora_layers: - state_dict.update(cls.pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) + lora_layers["text_encoder_2"] = text_encoder_2_lora_layers + lora_metadata["text_encoder_2"] = text_encoder_2_lora_adapter_metadata - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) + if not lora_layers: + raise ValueError( + "You must pass at least one of `transformer_lora_layers`, `text_encoder_lora_layers`, or `text_encoder_2_lora_layers`." ) - if text_encoder_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(text_encoder_lora_adapter_metadata, cls.text_encoder_name) - ) - - if text_encoder_2_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(text_encoder_2_lora_adapter_metadata, "text_encoder_2") - ) - - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.StableDiffusionXLLoraLoaderMixin.fuse_lora with unet->transformer @@ -1830,28 +1801,24 @@ class AuraFlowLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.fuse_lora @@ -2435,37 +2402,28 @@ class FluxLoraLoaderMixin(LoraBaseMixin): text_encoder_lora_adapter_metadata: LoRA adapter metadata associated with the text encoder to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} - - if not (transformer_lora_layers or text_encoder_lora_layers): - raise ValueError("You must pass at least one of `transformer_lora_layers` and `text_encoder_lora_layers`.") + lora_layers = {} + lora_metadata = {} if transformer_lora_layers: - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata if text_encoder_lora_layers: - state_dict.update(cls.pack_weights(text_encoder_lora_layers, cls.text_encoder_name)) + lora_layers[cls.text_encoder_name] = text_encoder_lora_layers + lora_metadata[cls.text_encoder_name] = text_encoder_lora_adapter_metadata - if transformer_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if text_encoder_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(text_encoder_lora_adapter_metadata, cls.text_encoder_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) def fuse_lora( @@ -3254,28 +3212,24 @@ class CogVideoXLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) def fuse_lora( @@ -3594,28 +3548,24 @@ class Mochi1LoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -3938,28 +3888,24 @@ class LTXVideoLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -4280,28 +4226,24 @@ class SanaLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -4624,28 +4566,24 @@ class HunyuanVideoLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -4969,28 +4907,24 @@ class Lumina2LoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.fuse_lora @@ -5384,28 +5318,24 @@ class WanLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -5802,28 +5732,24 @@ class SkyReelsV2LoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -6144,28 +6070,24 @@ class CogView4LoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -6488,28 +6410,24 @@ class HiDreamImageLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.fuse_lora @@ -6835,28 +6753,24 @@ class QwenImageLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora From 579673501535753e382974e1037498cf61c3c7a1 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 22 Sep 2025 21:57:30 +0530 Subject: [PATCH 02/18] add test and doc for QwenImageEdit Plus (#12363) * up * xfail some tests * up * up --- docs/source/en/api/pipelines/qwenimage.md | 34 ++- .../qwenimage/test_qwenimage_edit_plus.py | 253 ++++++++++++++++++ 2 files changed, 286 insertions(+), 1 deletion(-) create mode 100644 tests/pipelines/qwenimage/test_qwenimage_edit_plus.py diff --git a/docs/source/en/api/pipelines/qwenimage.md b/docs/source/en/api/pipelines/qwenimage.md index 2dec47309c..4c999bca35 100644 --- a/docs/source/en/api/pipelines/qwenimage.md +++ b/docs/source/en/api/pipelines/qwenimage.md @@ -26,6 +26,7 @@ Qwen-Image comes in the following variants: |:----------:|:--------:| | Qwen-Image | [`Qwen/Qwen-Image`](https://huggingface.co/Qwen/Qwen-Image) | | Qwen-Image-Edit | [`Qwen/Qwen-Image-Edit`](https://huggingface.co/Qwen/Qwen-Image-Edit) | +| Qwen-Image-Edit Plus | [Qwen/Qwen-Image-Edit-2509](https://huggingface.co/Qwen/Qwen-Image-Edit-2509) | @@ -96,6 +97,29 @@ The `guidance_scale` parameter in the pipeline is there to support future guidan +## Multi-image reference with QwenImageEditPlusPipeline + +With [`QwenImageEditPlusPipeline`], one can provide multiple images as input reference. + +``` +import torch +from PIL import Image +from diffusers import QwenImageEditPlusPipeline +from diffusers.utils import load_image + +pipe = QwenImageEditPlusPipeline.from_pretrained( + "Qwen/Qwen-Image-Edit-2509", torch_dtype=torch.bfloat16 +).to("cuda") + +image_1 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/grumpy.jpg") +image_2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peng.png") +image = pipe( + image=[image_1, image_2], + prompt="put the penguin and the cat at a game show called "Qwen Edit Plus Games"", + num_inference_steps=50 +).images[0] +``` + ## QwenImagePipeline [[autodoc]] QwenImagePipeline @@ -126,7 +150,15 @@ The `guidance_scale` parameter in the pipeline is there to support future guidan - all - __call__ -## QwenImaggeControlNetPipeline +## QwenImageControlNetPipeline + +[[autodoc]] QwenImageControlNetPipeline + - all + - __call__ + +## QwenImageEditPlusPipeline + +[[autodoc]] QwenImageEditPlusPipeline - all - __call__ diff --git a/tests/pipelines/qwenimage/test_qwenimage_edit_plus.py b/tests/pipelines/qwenimage/test_qwenimage_edit_plus.py new file mode 100644 index 0000000000..6faf347282 --- /dev/null +++ b/tests/pipelines/qwenimage/test_qwenimage_edit_plus.py @@ -0,0 +1,253 @@ +# Copyright 2025 The HuggingFace Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import pytest +import torch +from PIL import Image +from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor + +from diffusers import ( + AutoencoderKLQwenImage, + FlowMatchEulerDiscreteScheduler, + QwenImageEditPlusPipeline, + QwenImageTransformer2DModel, +) + +from ...testing_utils import enable_full_determinism, torch_device +from ..pipeline_params import TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, to_np + + +enable_full_determinism() + + +class QwenImageEditPlusPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = QwenImageEditPlusPipeline + params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"} + batch_params = frozenset(["prompt", "image"]) + image_params = frozenset(["image"]) + image_latents_params = frozenset(["latents"]) + required_optional_params = frozenset( + [ + "num_inference_steps", + "generator", + "latents", + "return_dict", + "callback_on_step_end", + "callback_on_step_end_tensor_inputs", + ] + ) + supports_dduf = False + test_xformers_attention = False + test_layerwise_casting = True + test_group_offloading = True + + def get_dummy_components(self): + tiny_ckpt_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration" + + torch.manual_seed(0) + transformer = QwenImageTransformer2DModel( + patch_size=2, + in_channels=16, + out_channels=4, + num_layers=2, + attention_head_dim=16, + num_attention_heads=3, + joint_attention_dim=16, + guidance_embeds=False, + axes_dims_rope=(8, 4, 4), + ) + + torch.manual_seed(0) + z_dim = 4 + vae = AutoencoderKLQwenImage( + base_dim=z_dim * 6, + z_dim=z_dim, + dim_mult=[1, 2, 4], + num_res_blocks=1, + temperal_downsample=[False, True], + latents_mean=[0.0] * z_dim, + latents_std=[1.0] * z_dim, + ) + + torch.manual_seed(0) + scheduler = FlowMatchEulerDiscreteScheduler() + + torch.manual_seed(0) + config = Qwen2_5_VLConfig( + text_config={ + "hidden_size": 16, + "intermediate_size": 16, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "num_key_value_heads": 2, + "rope_scaling": { + "mrope_section": [1, 1, 2], + "rope_type": "default", + "type": "default", + }, + "rope_theta": 1000000.0, + }, + vision_config={ + "depth": 2, + "hidden_size": 16, + "intermediate_size": 16, + "num_heads": 2, + "out_hidden_size": 16, + }, + hidden_size=16, + vocab_size=152064, + vision_end_token_id=151653, + vision_start_token_id=151652, + vision_token_id=151654, + ) + text_encoder = Qwen2_5_VLForConditionalGeneration(config) + tokenizer = Qwen2Tokenizer.from_pretrained(tiny_ckpt_id) + + components = { + "transformer": transformer, + "vae": vae, + "scheduler": scheduler, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "processor": Qwen2VLProcessor.from_pretrained(tiny_ckpt_id), + } + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + image = Image.new("RGB", (32, 32)) + inputs = { + "prompt": "dance monkey", + "image": [image, image], + "negative_prompt": "bad quality", + "generator": generator, + "num_inference_steps": 2, + "true_cfg_scale": 1.0, + "height": 32, + "width": 32, + "max_sequence_length": 16, + "output_type": "pt", + } + + return inputs + + def test_inference(self): + device = "cpu" + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = pipe(**inputs).images + generated_image = image[0] + self.assertEqual(generated_image.shape, (3, 32, 32)) + + # fmt: off + expected_slice = torch.tensor([[0.5637, 0.6341, 0.6001, 0.5620, 0.5794, 0.5498, 0.5757, 0.6389, 0.4174, 0.3597, 0.5649, 0.4894, 0.4969, 0.5255, 0.4083, 0.4986]]) + # fmt: on + + generated_slice = generated_image.flatten() + generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]]) + self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3)) + + def test_attention_slicing_forward_pass( + self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3 + ): + if not self.test_attention_slicing: + return + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + for component in pipe.components.values(): + if hasattr(component, "set_default_attn_processor"): + component.set_default_attn_processor() + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator_device = "cpu" + inputs = self.get_dummy_inputs(generator_device) + output_without_slicing = pipe(**inputs)[0] + + pipe.enable_attention_slicing(slice_size=1) + inputs = self.get_dummy_inputs(generator_device) + output_with_slicing1 = pipe(**inputs)[0] + + pipe.enable_attention_slicing(slice_size=2) + inputs = self.get_dummy_inputs(generator_device) + output_with_slicing2 = pipe(**inputs)[0] + + if test_max_difference: + max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max() + max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max() + self.assertLess( + max(max_diff1, max_diff2), + expected_max_diff, + "Attention slicing should not affect the inference results", + ) + + def test_vae_tiling(self, expected_diff_max: float = 0.2): + generator_device = "cpu" + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe.to("cpu") + pipe.set_progress_bar_config(disable=None) + + # Without tiling + inputs = self.get_dummy_inputs(generator_device) + inputs["height"] = inputs["width"] = 128 + output_without_tiling = pipe(**inputs)[0] + + # With tiling + pipe.vae.enable_tiling( + tile_sample_min_height=96, + tile_sample_min_width=96, + tile_sample_stride_height=64, + tile_sample_stride_width=64, + ) + inputs = self.get_dummy_inputs(generator_device) + inputs["height"] = inputs["width"] = 128 + output_with_tiling = pipe(**inputs)[0] + + self.assertLess( + (to_np(output_without_tiling) - to_np(output_with_tiling)).max(), + expected_diff_max, + "VAE tiling should not affect the inference results", + ) + + @pytest.mark.xfail(condition=True, reason="Preconfigured embeddings need to be revisited.", strict=True) + def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=1e-4, rtol=1e-4): + super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, atol, rtol) + + @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True) + def test_num_images_per_prompt(): + super().test_num_images_per_prompt() + + @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True) + def test_inference_batch_consistent(): + super().test_inference_batch_consistent() + + @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True) + def test_inference_batch_single_identical(): + super().test_inference_batch_single_identical() From 1448b035859dd57bbb565239dcdd79a025a85422 Mon Sep 17 00:00:00 2001 From: SahilCarterr <110806554+SahilCarterr@users.noreply.github.com> Date: Tue, 23 Sep 2025 01:34:13 +0530 Subject: [PATCH 03/18] [Fix] chroma docs (#12360) * Fixes chroma docs * fix docs fixed docs are now consistent --- src/diffusers/pipelines/chroma/pipeline_chroma.py | 10 +++++----- .../pipelines/chroma/pipeline_chroma_img2img.py | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index f3ed700bc4..19ea7729c9 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -688,11 +688,11 @@ class ChromaPipeline( their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. guidance_scale (`float`, *optional*, defaults to 3.5): - Embedded guiddance scale is enabled by setting `guidance_scale` > 1. Higher `guidance_scale` encourages - a model to generate images more aligned with `prompt` at the expense of lower image quality. - - Guidance-distilled models approximates true classifer-free guidance for `guidance_scale` > 1. Refer to - the [paper](https://huggingface.co/papers/2210.03142) to learn more. + Guidance scale as defined in [Classifier-Free Diffusion + Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. + of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting + `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to + the text `prompt`, usually at the expense of lower image quality. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`torch.Generator` or `List[torch.Generator]`, *optional*): diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py b/src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py index 26f13fe06c..9afd4b9e15 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py @@ -749,12 +749,12 @@ class ChromaImg2ImgPipeline( Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. - guidance_scale (`float`, *optional*, defaults to 5.0): - Embedded guiddance scale is enabled by setting `guidance_scale` > 1. Higher `guidance_scale` encourages - a model to generate images more aligned with `prompt` at the expense of lower image quality. - - Guidance-distilled models approximates true classifer-free guidance for `guidance_scale` > 1. Refer to - the [paper](https://huggingface.co/papers/2210.03142) to learn more. + guidance_scale (`float`, *optional*, defaults to 3.5): + Guidance scale as defined in [Classifier-Free Diffusion + Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. + of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting + `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to + the text `prompt`, usually at the expense of lower image quality. strength (`float, *optional*, defaults to 0.9): Conceptually, indicates how much to transform the reference image. Must be between 0 and 1. image will be used as a starting point, adding more noise to it the larger the strength. The number of denoising From 76810eca2bbc5ce1e47997e7d9ff690a003fa70f Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Tue, 23 Sep 2025 10:29:16 -0700 Subject: [PATCH 04/18] [docs] Schedulers (#12246) * init * toctree * scheduler suggestions * toctree --- docs/source/en/_toctree.yml | 6 +- docs/source/en/using-diffusers/models.md | 120 ------ .../en/using-diffusers/scheduler_features.md | 235 ----------- docs/source/en/using-diffusers/schedulers.md | 397 +++++++++++------- 4 files changed, 236 insertions(+), 522 deletions(-) delete mode 100644 docs/source/en/using-diffusers/models.md delete mode 100644 docs/source/en/using-diffusers/scheduler_features.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 14dbfe3ea1..856874d519 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -23,11 +23,7 @@ - local: using-diffusers/reusing_seeds title: Reproducibility - local: using-diffusers/schedulers - title: Load schedulers and models - - local: using-diffusers/models - title: Models - - local: using-diffusers/scheduler_features - title: Scheduler features + title: Schedulers - local: using-diffusers/other-formats title: Model files and layouts - local: using-diffusers/push_to_hub diff --git a/docs/source/en/using-diffusers/models.md b/docs/source/en/using-diffusers/models.md deleted file mode 100644 index 22c78d490a..0000000000 --- a/docs/source/en/using-diffusers/models.md +++ /dev/null @@ -1,120 +0,0 @@ - - -[[open-in-colab]] - -# Models - -A diffusion model relies on a few individual models working together to generate an output. These models are responsible for denoising, encoding inputs, and decoding latents into the actual outputs. - -This guide will show you how to load models. - -## Loading a model - -All models are loaded with the [`~ModelMixin.from_pretrained`] method, which downloads and caches the latest model version. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache. - -Pass the `subfolder` argument to [`~ModelMixin.from_pretrained`] to specify where to load the model weights from. Omit the `subfolder` argument if the repository doesn't have a subfolder structure or if you're loading a standalone model. - -```py -from diffusers import QwenImageTransformer2DModel - -model = QwenImageTransformer2DModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer") -``` - -## AutoModel - -[`AutoModel`] detects the model class from a `model_index.json` file or a model's `config.json` file. It fetches the correct model class from these files and delegates the actual loading to the model class. [`AutoModel`] is useful for automatic model type detection without needing to know the exact model class beforehand. - -```py -from diffusers import AutoModel - -model = AutoModel.from_pretrained( - "Qwen/Qwen-Image", subfolder="transformer" -) -``` - -## Model data types - -Use the `torch_dtype` argument in [`~ModelMixin.from_pretrained`] to load a model with a specific data type. This allows you to load a model in a lower precision to reduce memory usage. - -```py -import torch -from diffusers import QwenImageTransformer2DModel - -model = QwenImageTransformer2DModel.from_pretrained( - "Qwen/Qwen-Image", - subfolder="transformer", - torch_dtype=torch.bfloat16 -) -``` - -[nn.Module.to](https://docs.pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.to) can also convert to a specific data type on the fly. However, it converts *all* weights to the requested data type unlike `torch_dtype` which respects `_keep_in_fp32_modules`. This argument preserves layers in `torch.float32` for numerical stability and best generation quality (see example [_keep_in_fp32_modules](https://github.com/huggingface/diffusers/blob/f864a9a352fa4a220d860bfdd1782e3e5af96382/src/diffusers/models/transformers/transformer_wan.py#L374)) - -```py -from diffusers import QwenImageTransformer2DModel - -model = QwenImageTransformer2DModel.from_pretrained( - "Qwen/Qwen-Image", subfolder="transformer" -) -model = model.to(dtype=torch.float16) -``` - -## Device placement - -Use the `device_map` argument in [`~ModelMixin.from_pretrained`] to place a model on an accelerator like a GPU. It is especially helpful where there are multiple GPUs. - -Diffusers currently provides three options to `device_map` for individual models, `"cuda"`, `"balanced"` and `"auto"`. Refer to the table below to compare the three placement strategies. - -| parameter | description | -|---|---| -| `"cuda"` | places pipeline on a supported accelerator (CUDA) | -| `"balanced"` | evenly distributes pipeline on all GPUs | -| `"auto"` | distribute model from fastest device first to slowest | - -Use the `max_memory` argument in [`~ModelMixin.from_pretrained`] to allocate a maximum amount of memory to use on each device. By default, Diffusers uses the maximum amount available. - -```py -import torch -from diffusers import QwenImagePipeline - -max_memory = {0: "16GB", 1: "16GB"} -pipeline = QwenImagePipeline.from_pretrained( - "Qwen/Qwen-Image", - torch_dtype=torch.bfloat16, - device_map="cuda", - max_memory=max_memory -) -``` - -The `hf_device_map` attribute allows you to access and view the `device_map`. - -```py -print(transformer.hf_device_map) -# {'': device(type='cuda')} -``` - -## Saving models - -Save a model with the [`~ModelMixin.save_pretrained`] method. - -```py -from diffusers import QwenImageTransformer2DModel - -model = QwenImageTransformer2DModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer") -model.save_pretrained("./local/model") -``` - -For large models, it is helpful to use `max_shard_size` to save a model as multiple shards. A shard can be loaded faster and save memory (refer to the [parallel loading](./loading#parallel-loading) docs for more details), especially if there is more than one GPU. - -```py -model.save_pretrained("./local/model", max_shard_size="5GB") -``` diff --git a/docs/source/en/using-diffusers/scheduler_features.md b/docs/source/en/using-diffusers/scheduler_features.md deleted file mode 100644 index f7977d53d5..0000000000 --- a/docs/source/en/using-diffusers/scheduler_features.md +++ /dev/null @@ -1,235 +0,0 @@ - - -# Scheduler features - -The scheduler is an important component of any diffusion model because it controls the entire denoising (or sampling) process. There are many types of schedulers, some are optimized for speed and some for quality. With Diffusers, you can modify the scheduler configuration to use custom noise schedules, sigmas, and rescale the noise schedule. Changing these parameters can have profound effects on inference quality and speed. - -This guide will demonstrate how to use these features to improve inference quality. - -> [!TIP] -> Diffusers currently only supports the `timesteps` and `sigmas` parameters for a select list of schedulers and pipelines. Feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you want to extend these parameters to a scheduler and pipeline that does not currently support it! - -## Timestep schedules - -The timestep or noise schedule determines the amount of noise at each sampling step. The scheduler uses this to generate an image with the corresponding amount of noise at each step. The timestep schedule is generated from the scheduler's default configuration, but you can customize the scheduler to use new and optimized sampling schedules that aren't in Diffusers yet. - -For example, [Align Your Steps (AYS)](https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/) is a method for optimizing a sampling schedule to generate a high-quality image in as little as 10 steps. The optimal [10-step schedule](https://github.com/huggingface/diffusers/blob/a7bf77fc284810483f1e60afe34d1d27ad91ce2e/src/diffusers/schedulers/scheduling_utils.py#L51) for Stable Diffusion XL is: - -```py -from diffusers.schedulers import AysSchedules - -sampling_schedule = AysSchedules["StableDiffusionXLTimesteps"] -print(sampling_schedule) -"[999, 845, 730, 587, 443, 310, 193, 116, 53, 13]" -``` - -You can use the AYS sampling schedule in a pipeline by passing it to the `timesteps` parameter. - -```py -pipeline = StableDiffusionXLPipeline.from_pretrained( - "SG161222/RealVisXL_V4.0", - torch_dtype=torch.float16, - variant="fp16", -).to("cuda") -pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, algorithm_type="sde-dpmsolver++") - -prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up" -generator = torch.Generator(device="cpu").manual_seed(2487854446) -image = pipeline( - prompt=prompt, - negative_prompt="", - generator=generator, - timesteps=sampling_schedule, -).images[0] -``` - -
-
- -
AYS timestep schedule 10 steps
-
-
- -
Linearly-spaced timestep schedule 10 steps
-
-
- -
Linearly-spaced timestep schedule 25 steps
-
-
- -## Timestep spacing - -The way sample steps are selected in the schedule can affect the quality of the generated image, especially with respect to [rescaling the noise schedule](#rescale-noise-schedule), which can enable a model to generate much brighter or darker images. Diffusers provides three timestep spacing methods: - -- `leading` creates evenly spaced steps -- `linspace` includes the first and last steps and evenly selects the remaining intermediate steps -- `trailing` only includes the last step and evenly selects the remaining intermediate steps starting from the end - -It is recommended to use the `trailing` spacing method because it generates higher quality images with more details when there are fewer sample steps. But the difference in quality is not as obvious for more standard sample step values. - -```py -import torch -from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler - -pipeline = StableDiffusionXLPipeline.from_pretrained( - "SG161222/RealVisXL_V4.0", - torch_dtype=torch.float16, - variant="fp16", -).to("cuda") -pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, timestep_spacing="trailing") - -prompt = "A cinematic shot of a cute little black cat sitting on a pumpkin at night" -generator = torch.Generator(device="cpu").manual_seed(2487854446) -image = pipeline( - prompt=prompt, - negative_prompt="", - generator=generator, - num_inference_steps=5, -).images[0] -image -``` - -
-
- -
trailing spacing after 5 steps
-
-
- -
leading spacing after 5 steps
-
-
- -## Sigmas - -The `sigmas` parameter is the amount of noise added at each timestep according to the timestep schedule. Like the `timesteps` parameter, you can customize the `sigmas` parameter to control how much noise is added at each step. When you use a custom `sigmas` value, the `timesteps` are calculated from the custom `sigmas` value and the default scheduler configuration is ignored. - -For example, you can manually pass the [sigmas](https://github.com/huggingface/diffusers/blob/6529ee67ec02fcf58d2fd9242164ea002b351d75/src/diffusers/schedulers/scheduling_utils.py#L55) for something like the 10-step AYS schedule from before to the pipeline. - -```py -import torch - -from diffusers import DiffusionPipeline, EulerDiscreteScheduler - -model_id = "stabilityai/stable-diffusion-xl-base-1.0" -pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", - torch_dtype=torch.float16, - variant="fp16", -).to("cuda") -pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config) - -sigmas = [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.0] -prompt = "anthropomorphic capybara wearing a suit and working with a computer" -generator = torch.Generator(device='cuda').manual_seed(123) -image = pipeline( - prompt=prompt, - num_inference_steps=10, - sigmas=sigmas, - generator=generator -).images[0] -``` - -When you take a look at the scheduler's `timesteps` parameter, you'll see that it is the same as the AYS timestep schedule because the `timestep` schedule is calculated from the `sigmas`. - -```py -print(f" timesteps: {pipe.scheduler.timesteps}") -"timesteps: tensor([999., 845., 730., 587., 443., 310., 193., 116., 53., 13.], device='cuda:0')" -``` - -### Karras sigmas - -> [!TIP] -> Refer to the scheduler API [overview](../api/schedulers/overview) for a list of schedulers that support Karras sigmas. -> -> Karras sigmas should not be used for models that weren't trained with them. For example, the base Stable Diffusion XL model shouldn't use Karras sigmas but the [DreamShaperXL](https://hf.co/Lykon/dreamshaper-xl-1-0) model can since they are trained with Karras sigmas. - -Karras scheduler's use the timestep schedule and sigmas from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://hf.co/papers/2206.00364) paper. This scheduler variant applies a smaller amount of noise per step as it approaches the end of the sampling process compared to other schedulers, and can increase the level of details in the generated image. - -Enable Karras sigmas by setting `use_karras_sigmas=True` in the scheduler. - -```py -import torch -from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler - -pipeline = StableDiffusionXLPipeline.from_pretrained( - "SG161222/RealVisXL_V4.0", - torch_dtype=torch.float16, - variant="fp16", -).to("cuda") -pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, algorithm_type="sde-dpmsolver++", use_karras_sigmas=True) - -prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up" -generator = torch.Generator(device="cpu").manual_seed(2487854446) -image = pipeline( - prompt=prompt, - negative_prompt="", - generator=generator, -).images[0] -``` - -
-
- -
Karras sigmas enabled
-
-
- -
Karras sigmas disabled
-
-
- -## Rescale noise schedule - -In the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://hf.co/papers/2305.08891) paper, the authors discovered that common noise schedules allowed some signal to leak into the last timestep. This signal leakage at inference can cause models to only generate images with medium brightness. By enforcing a zero signal-to-noise ratio (SNR) for the timstep schedule and sampling from the last timestep, the model can be improved to generate very bright or dark images. - -> [!TIP] -> For inference, you need a model that has been trained with *v_prediction*. To train your own model with *v_prediction*, add the following flag to the [train_text_to_image.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [train_text_to_image_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts. -> -> ```bash -> --prediction_type="v_prediction" -> ``` - -For example, load the [ptx0/pseudo-journey-v2](https://hf.co/ptx0/pseudo-journey-v2) checkpoint which was trained with `v_prediction` and the [`DDIMScheduler`]. Configure the following parameters in the [`DDIMScheduler`]: - -* `rescale_betas_zero_snr=True` to rescale the noise schedule to zero SNR -* `timestep_spacing="trailing"` to start sampling from the last timestep - -Set `guidance_rescale` in the pipeline to prevent over-exposure. A lower value increases brightness but some of the details may appear washed out. - -```py -from diffusers import DiffusionPipeline, DDIMScheduler - -pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True) - -pipeline.scheduler = DDIMScheduler.from_config( - pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing" -) -pipeline.to("cuda") -prompt = "cinematic photo of a snowy mountain at night with the northern lights aurora borealis overhead, 35mm photograph, film, professional, 4k, highly detailed" -generator = torch.Generator(device="cpu").manual_seed(23) -image = pipeline(prompt, guidance_rescale=0.7, generator=generator).images[0] -image -``` - -
-
- -
default Stable Diffusion v2-1 image
-
-
- -
image with zero SNR and trailing timestep spacing enabled
-
-
diff --git a/docs/source/en/using-diffusers/schedulers.md b/docs/source/en/using-diffusers/schedulers.md index 6d928f8037..0e236e4e3e 100644 --- a/docs/source/en/using-diffusers/schedulers.md +++ b/docs/source/en/using-diffusers/schedulers.md @@ -10,200 +10,273 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Load schedulers and models - [[open-in-colab]] -Diffusion pipelines are a collection of interchangeable schedulers and models that can be mixed and matched to tailor a pipeline to a specific use case. The scheduler encapsulates the entire denoising process such as the number of denoising steps and the algorithm for finding the denoised sample. A scheduler is not parameterized or trained so they don't take very much memory. The model is usually only concerned with the forward pass of going from a noisy input to a less noisy sample. +# Schedulers -This guide will show you how to load schedulers and models to customize a pipeline. You'll use the [stable-diffusion-v1-5/stable-diffusion-v1-5](https://hf.co/stable-diffusion-v1-5/stable-diffusion-v1-5) checkpoint throughout this guide, so let's load it first. +A scheduler is an algorithm that provides instructions to the denoising process such as how much noise to remove at a certain step. It takes the model prediction from step *t* and applies an update for how to compute the next sample at step *t-1*. Different schedulers produce different results; some are faster while others are more accurate. + +Diffusers supports many schedulers and allows you to modify their timestep schedules, timestep spacing, and more, to generate high-quality images in fewer steps. + +This guide will show you how to load and customize schedulers. + +## Loading schedulers + +Schedulers don't have any parameters and are defined in a configuration file. Access the `.scheduler` attribute of a pipeline to view the configuration. ```py import torch from diffusers import DiffusionPipeline pipeline = DiffusionPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True -).to("cuda") -``` - -You can see what scheduler this pipeline uses with the `pipeline.scheduler` attribute. - -```py + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, device_map="cuda" +) pipeline.scheduler -PNDMScheduler { - "_class_name": "PNDMScheduler", - "_diffusers_version": "0.21.4", - "beta_end": 0.012, - "beta_schedule": "scaled_linear", - "beta_start": 0.00085, - "clip_sample": false, - "num_train_timesteps": 1000, - "set_alpha_to_one": false, - "skip_prk_steps": true, - "steps_offset": 1, - "timestep_spacing": "leading", - "trained_betas": null -} ``` -## Load a scheduler - -Schedulers are defined by a configuration file that can be used by a variety of schedulers. Load a scheduler with the [`SchedulerMixin.from_pretrained`] method, and specify the `subfolder` parameter to load the configuration file into the correct subfolder of the pipeline repository. - -For example, to load the [`DDIMScheduler`]: - -```py -from diffusers import DDIMScheduler, DiffusionPipeline - -ddim = DDIMScheduler.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="scheduler") -``` - -Then you can pass the newly loaded scheduler to the pipeline. - -```python -pipeline = DiffusionPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", scheduler=ddim, torch_dtype=torch.float16, use_safetensors=True -).to("cuda") -``` - -## Compare schedulers - -Schedulers have their own unique strengths and weaknesses, making it difficult to quantitatively compare which scheduler works best for a pipeline. You typically have to make a trade-off between denoising speed and denoising quality. We recommend trying out different schedulers to find one that works best for your use case. Call the `pipeline.scheduler.compatibles` attribute to see what schedulers are compatible with a pipeline. - -Let's compare the [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`], and the [`DPMSolverMultistepScheduler`] on the following prompt and seed. - -```py -import torch -from diffusers import DiffusionPipeline - -pipeline = DiffusionPipeline.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True -).to("cuda") - -prompt = "A photograph of an astronaut riding a horse on Mars, high resolution, high definition." -generator = torch.Generator(device="cuda").manual_seed(8) -``` - -To change the pipelines scheduler, use the [`~ConfigMixin.from_config`] method to load a different scheduler's `pipeline.scheduler.config` into the pipeline. - - - - -[`LMSDiscreteScheduler`] typically generates higher quality images than the default scheduler. - -```py -from diffusers import LMSDiscreteScheduler - -pipeline.scheduler = LMSDiscreteScheduler.from_config(pipeline.scheduler.config) -image = pipeline(prompt, generator=generator).images[0] -image -``` - - - - -[`EulerDiscreteScheduler`] can generate higher quality images in just 30 steps. - -```py -from diffusers import EulerDiscreteScheduler - -pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config) -image = pipeline(prompt, generator=generator).images[0] -image -``` - - - - -[`EulerAncestralDiscreteScheduler`] can generate higher quality images in just 30 steps. - -```py -from diffusers import EulerAncestralDiscreteScheduler - -pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config) -image = pipeline(prompt, generator=generator).images[0] -image -``` - - - - -[`DPMSolverMultistepScheduler`] provides a balance between speed and quality and can generate higher quality images in just 20 steps. +Load a different scheduler with [`~SchedulerMixin.from_pretrained`] and specify the `subfolder` argument to load the configuration file into the correct subfolder of the pipeline repository. Pass the new scheduler to the existing pipeline. ```py from diffusers import DPMSolverMultistepScheduler -pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) -image = pipeline(prompt, generator=generator).images[0] +dpm = DPMSolverMultistepScheduler.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler" +) +pipeline = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + scheduler=dpm, + torch_dtype=torch.float16, + device_map="cuda" +) +pipeline.scheduler +``` + +## Timestep schedules + +Timestep or noise schedule decides how noise is distributed over the denoising process. The schedule can be linear or more concentrated toward the beginning or end. It is a precomputed sequence of noise levels generated from the scheduler's default configuration, but it can be customized to use other schedules. + +> [!TIP] +> The `timesteps` argument is only supported for a select list of schedulers and pipelines. Feel free to open a feature request if you want to extend these parameters to a scheduler and pipeline that does not currently support it! + +The example below uses the [Align Your Steps (AYS)](https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/) schedule which can generate a high-quality image in 10 steps, significantly speeding up generation and reducing computation time. + +Import the schedule and pass it to the `timesteps` argument in the pipeline. + +```py +import torch +from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler +from diffusers.schedulers import AysSchedules + +sampling_schedule = AysSchedules["StableDiffusionXLTimesteps"] +print(sampling_schedule) +"[999, 845, 730, 587, 443, 310, 193, 116, 53, 13]" + +pipeline = DiffusionPipeline.from_pretrained( + "SG161222/RealVisXL_V4.0", + torch_dtype=torch.float16, + device_map="cuda" +) +pipeline.scheduler = DPMSolverMultistepScheduler.from_config( + pipeline.scheduler.config, algorithm_type="sde-dpmsolver++" +) + +prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up" +image = pipeline( + prompt=prompt, + negative_prompt="", + timesteps=sampling_schedule, +).images[0] +``` + +
+
+ +
AYS timestep schedule 10 steps
+
+
+ +
Linearly-spaced timestep schedule 10 steps
+
+
+ +
Linearly-spaced timestep schedule 25 steps
+
+
+ +### Rescaling schedules + +Denoising should begin with pure noise and the signal-to-noise (SNR) ration should be zero. However, some models don't actually start from pure noise which makes it difficult to generate images at brightness extremes. + +> [!TIP] +> Train your own model with `v_prediction` by adding the `--prediction_type="v_prediction"` flag to your training script. You can also [search](https://huggingface.co/search/full-text?q=v_prediction&type=model) for existing models trained with `v_prediction`. + +To fix this, a model must be trained with `v_prediction`. If a model is trained with `v_prediction`, then enable the following arguments in the scheduler. + +- Set `rescale_betas_zero_snr=True` to rescale the noise schedule to the very last timestep with exactly zero SNR +- Set `timestep_spacing="trailing"` to force sampling from the last timestep with pure noise + +```py +from diffusers import DiffusionPipeline, DDIMScheduler + +pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", device_map="cuda") + +pipeline.scheduler = DDIMScheduler.from_config( + pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing" +) +``` + +Set `guidance_rescale` in the pipeline to avoid overexposed images. A lower value increases brightness, but some details may appear washed out. + +```py +prompt = """ +cinematic photo of a snowy mountain at night with the northern lights aurora borealis +overhead, 35mm photograph, film, professional, 4k, highly detailed +""" +image = pipeline(prompt, guidance_rescale=0.7).images[0] +``` + +
+
+ +
default Stable Diffusion v2-1 image
+
+
+ +
image with zero SNR and trailing timestep spacing enabled
+
+
+ +## Timestep spacing + +Timestep spacing refers to the specific steps *t* to sample from from the schedule. Diffusers provides three spacing types as shown below. + +| spacing strategy | spacing calculation | example timesteps | +|---|---|---| +| `leading` | evenly spaced steps | `[900, 800, 700, ..., 100, 0]` | +| `linspace` | include first and last steps and evenly divide remaining intermediate steps | `[1000, 888.89, 777.78, ..., 111.11, 0]` | +| `trailing` | include last step and evenly divide remaining intermediate steps beginning from the end | `[999, 899, 799, 699, 599, 499, 399, 299, 199, 99]` | + +Pass the spacing strategy to the `timestep_spacing` argument in the scheduler. + +> [!TIP] +> The `trailing` strategy typically produces higher quality images with more details with fewer steps, but the difference in quality is not as obvious for more standard step values. + +```py +import torch +from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler + +pipeline = DiffusionPipeline.from_pretrained( + "SG161222/RealVisXL_V4.0", + torch_dtype=torch.float16, + device_map="cuda" +) +pipeline.scheduler = DPMSolverMultistepScheduler.from_config( + pipeline.scheduler.config, timestep_spacing="trailing" +) + +prompt = "A cinematic shot of a cute little black cat sitting on a pumpkin at night" +image = pipeline( + prompt=prompt, + negative_prompt="", + num_inference_steps=5, +).images[0] image ``` -
-
-
- -
LMSDiscreteScheduler
+ +
trailing spacing after 5 steps
- -
EulerDiscreteScheduler
-
-
-
-
- -
EulerAncestralDiscreteScheduler
-
-
- -
DPMSolverMultistepScheduler
+ +
leading spacing after 5 steps
-Most images look very similar and are comparable in quality. Again, it often comes down to your specific use case so a good approach is to run multiple different schedulers and compare the results. +## Sigmas -## Models +Sigmas is a measure of how noisy a sample is at a certain step as defined by the schedule. When using custom `sigmas`, the `timesteps` are calculated from these values instead of the default scheduler configuration. -Models are loaded from the [`ModelMixin.from_pretrained`] method, which downloads and caches the latest version of the model weights and configurations. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache instead of re-downloading them. +> [!TIP] +> The `sigmas` argument is only supported for a select list of schedulers and pipelines. Feel free to open a feature request if you want to extend these parameters to a scheduler and pipeline that does not currently support it! -Models can be loaded from a subfolder with the `subfolder` argument. For example, the model weights for [stable-diffusion-v1-5/stable-diffusion-v1-5](https://hf.co/stable-diffusion-v1-5/stable-diffusion-v1-5) are stored in the [unet](https://hf.co/stable-diffusion-v1-5/stable-diffusion-v1-5/tree/main/unet) subfolder. - -```python -from diffusers import UNet2DConditionModel - -unet = UNet2DConditionModel.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet", use_safetensors=True) -``` - -They can also be directly loaded from a [repository](https://huggingface.co/google/ddpm-cifar10-32/tree/main). - -```python -from diffusers import UNet2DModel - -unet = UNet2DModel.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True) -``` - -To load and save model variants, specify the `variant` argument in [`ModelMixin.from_pretrained`] and [`ModelMixin.save_pretrained`]. - -```python -from diffusers import UNet2DConditionModel - -unet = UNet2DConditionModel.from_pretrained( - "stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet", variant="non_ema", use_safetensors=True -) -unet.save_pretrained("./local-unet", variant="non_ema") -``` - -Use the `torch_dtype` argument in [`~ModelMixin.from_pretrained`] to specify the dtype to load a model in. +Pass the custom sigmas to the `sigmas` argument in the pipeline. The example below uses the [sigmas](https://github.com/huggingface/diffusers/blob/6529ee67ec02fcf58d2fd9242164ea002b351d75/src/diffusers/schedulers/scheduling_utils.py#L55) from the 10-step AYS schedule. ```py -from diffusers import AutoModel +import torch +from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler -unet = AutoModel.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", torch_dtype=torch.float16 +pipeline = DiffusionPipeline.from_pretrained( + "SG161222/RealVisXL_V4.0", + torch_dtype=torch.float16, + device_map="cuda" ) +pipeline.scheduler = DPMSolverMultistepScheduler.from_config( + pipeline.scheduler.config, algorithm_type="sde-dpmsolver++" +) + +sigmas = [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.0] +prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up" +image = pipeline( + prompt=prompt, + negative_prompt="", + sigmas=sigmas, +).images[0] ``` -You can also use the [torch.Tensor.to](https://docs.pytorch.org/docs/stable/generated/torch.Tensor.to.html) method to convert to the specified dtype on the fly. It converts *all* weights unlike the `torch_dtype` argument that respects the `_keep_in_fp32_modules`. This is important for models whose layers must remain in fp32 for numerical stability and best generation quality (see example [here](https://github.com/huggingface/diffusers/blob/f864a9a352fa4a220d860bfdd1782e3e5af96382/src/diffusers/models/transformers/transformer_wan.py#L374)). +### Karras sigmas + +[Karras sigmas](https://huggingface.co/papers/2206.00364) resamples the noise schedule for more efficient sampling by clustering sigmas more densely in the middle of the sequence where structure reconstruction is critical, while using fewer sigmas at the beginning and end where noise changes have less impact. This can increase the level of details in a generated image. + +Set `use_karras_sigmas=True` in the scheduler to enable it. + +```py +import torch +from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler + +pipeline = DiffusionPipeline.from_pretrained( + "SG161222/RealVisXL_V4.0", + torch_dtype=torch.float16, + device_map="cuda" +) +pipeline.scheduler = DPMSolverMultistepScheduler.from_config( + pipeline.scheduler.config, + algorithm_type="sde-dpmsolver++", + use_karras_sigmas=True, +) + +prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up" +image = pipeline( + prompt=prompt, + negative_prompt="", + sigmas=sigmas, +).images[0] +``` + +
+
+ +
Karras sigmas enabled
+
+
+ +
Karras sigmas disabled
+
+
+ +Refer to the scheduler API [overview](../api/schedulers/overview) for a list of schedulers that support Karras sigmas. It should only be used for models trained with Karras sigmas. + +## Choosing a scheduler + +It's important to try different schedulers to find the best one for your use case. Here are a few recommendations to help you get started. + +- DPM++ 2M SDE Karras is generally a good all-purpose option. +- [`TCDScheduler`] works well for distilled models. +- [`FlowMatchEulerDiscreteScheduler`] and [`FlowMatchHeunDiscreteScheduler`] for FlowMatch models. +- [`EulerDiscreteScheduler`] or [`EulerAncestralDiscreteScheduler`] for generating anime style images. +- DPM++ 2M paired with [`LCMScheduler`] on SDXL for generating realistic images. + +## Resources + +- Read the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) paper for more details about rescaling the noise schedule to enforce zero SNR. \ No newline at end of file From 80de641c1c7973dd83cdb9ebb0946affb28e00f1 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Tue, 23 Sep 2025 19:31:42 +0200 Subject: [PATCH 05/18] Allow Automodel to support custom model code (#12353) * update * update --- src/diffusers/models/auto_model.py | 67 +++++++++++++------- src/diffusers/utils/dynamic_modules_utils.py | 5 ++ 2 files changed, 50 insertions(+), 22 deletions(-) diff --git a/src/diffusers/models/auto_model.py b/src/diffusers/models/auto_model.py index bfe386f1f6..ada0d54e54 100644 --- a/src/diffusers/models/auto_model.py +++ b/src/diffusers/models/auto_model.py @@ -19,6 +19,7 @@ from huggingface_hub.utils import validate_hf_hub_args from ..configuration_utils import ConfigMixin from ..utils import logging +from ..utils.dynamic_modules_utils import get_class_from_dynamic_module, resolve_trust_remote_code logger = logging.get_logger(__name__) @@ -114,6 +115,8 @@ class AutoModel(ConfigMixin): disable_mmap ('bool', *optional*, defaults to 'False'): Whether to disable mmap when loading a Safetensors model. This option can perform better when the model is on a network mount or hard drive, which may not handle the seeky-ness of mmap very well. + trust_remote_cocde (`bool`, *optional*, defaults to `False`): + Whether to trust remote code @@ -140,22 +143,22 @@ class AutoModel(ConfigMixin): You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. ``` """ - cache_dir = kwargs.pop("cache_dir", None) - force_download = kwargs.pop("force_download", False) - proxies = kwargs.pop("proxies", None) - token = kwargs.pop("token", None) - local_files_only = kwargs.pop("local_files_only", False) - revision = kwargs.pop("revision", None) subfolder = kwargs.pop("subfolder", None) + trust_remote_code = kwargs.pop("trust_remote_code", False) - load_config_kwargs = { - "cache_dir": cache_dir, - "force_download": force_download, - "proxies": proxies, - "token": token, - "local_files_only": local_files_only, - "revision": revision, - } + hub_kwargs_names = [ + "cache_dir", + "force_download", + "local_files_only", + "proxies", + "resume_download", + "revision", + "token", + ] + hub_kwargs = {name: kwargs.pop(name, None) for name in hub_kwargs_names} + + # load_config_kwargs uses the same hub kwargs minus subfolder and resume_download + load_config_kwargs = {k: v for k, v in hub_kwargs.items() if k not in ["subfolder", "resume_download"]} library = None orig_class_name = None @@ -189,15 +192,35 @@ class AutoModel(ConfigMixin): else: raise ValueError(f"Couldn't find model associated with the config file at {pretrained_model_or_path}.") - from ..pipelines.pipeline_loading_utils import ALL_IMPORTABLE_CLASSES, get_class_obj_and_candidates + has_remote_code = "auto_map" in config and cls.__name__ in config["auto_map"] + trust_remote_code = resolve_trust_remote_code(trust_remote_code, pretrained_model_or_path, has_remote_code) + if not (has_remote_code and trust_remote_code): + raise ValueError( + "Selected model repository does not happear to have any custom code or does not have a valid `config.json` file." + ) - model_cls, _ = get_class_obj_and_candidates( - library_name=library, - class_name=orig_class_name, - importable_classes=ALL_IMPORTABLE_CLASSES, - pipelines=None, - is_pipeline_module=False, - ) + if has_remote_code and trust_remote_code: + class_ref = config["auto_map"][cls.__name__] + module_file, class_name = class_ref.split(".") + module_file = module_file + ".py" + model_cls = get_class_from_dynamic_module( + pretrained_model_or_path, + subfolder=subfolder, + module_file=module_file, + class_name=class_name, + **hub_kwargs, + **kwargs, + ) + else: + from ..pipelines.pipeline_loading_utils import ALL_IMPORTABLE_CLASSES, get_class_obj_and_candidates + + model_cls, _ = get_class_obj_and_candidates( + library_name=library, + class_name=orig_class_name, + importable_classes=ALL_IMPORTABLE_CLASSES, + pipelines=None, + is_pipeline_module=False, + ) if model_cls is None: raise ValueError(f"AutoModel can't find a model linked to {orig_class_name}.") diff --git a/src/diffusers/utils/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py index 674eb65773..de947a12e2 100644 --- a/src/diffusers/utils/dynamic_modules_utils.py +++ b/src/diffusers/utils/dynamic_modules_utils.py @@ -247,6 +247,7 @@ def find_pipeline_class(loaded_module): def get_cached_module_file( pretrained_model_name_or_path: Union[str, os.PathLike], module_file: str, + subfolder: Optional[str] = None, cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, proxies: Optional[Dict[str, str]] = None, @@ -353,6 +354,7 @@ def get_cached_module_file( resolved_module_file = hf_hub_download( pretrained_model_name_or_path, module_file, + subfolder=subfolder, cache_dir=cache_dir, force_download=force_download, proxies=proxies, @@ -410,6 +412,7 @@ def get_cached_module_file( get_cached_module_file( pretrained_model_name_or_path, f"{module_needed}.py", + subfolder=subfolder, cache_dir=cache_dir, force_download=force_download, proxies=proxies, @@ -424,6 +427,7 @@ def get_cached_module_file( def get_class_from_dynamic_module( pretrained_model_name_or_path: Union[str, os.PathLike], module_file: str, + subfolder: Optional[str] = None, class_name: Optional[str] = None, cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, @@ -497,6 +501,7 @@ def get_class_from_dynamic_module( final_module = get_cached_module_file( pretrained_model_name_or_path, module_file, + subfolder=subfolder, cache_dir=cache_dir, force_download=force_download, proxies=proxies, From a72bc0c4bb817d382a59b38bba8d9a71661f56cb Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Tue, 23 Sep 2025 10:59:46 -0700 Subject: [PATCH 06/18] [docs] Attention backends (#12320) * init * feedback * update * feedback * fixes --- docs/source/en/_toctree.yml | 2 + .../en/optimization/attention_backends.md | 106 ++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 docs/source/en/optimization/attention_backends.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 856874d519..4879a7bf04 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -64,6 +64,8 @@ title: Accelerate inference - local: optimization/cache title: Caching + - local: optimization/attention_backends + title: Attention backends - local: optimization/memory title: Reduce memory usage - local: optimization/speed-memory-optims diff --git a/docs/source/en/optimization/attention_backends.md b/docs/source/en/optimization/attention_backends.md new file mode 100644 index 0000000000..04c8b4ba92 --- /dev/null +++ b/docs/source/en/optimization/attention_backends.md @@ -0,0 +1,106 @@ + + +# Attention backends + +> [!TIP] +> The attention dispatcher is an experimental feature. Please open an issue if you have any feedback or encounter any problems. + +Diffusers provides several optimized attention algorithms that are more memory and computationally efficient through it's *attention dispatcher*. The dispatcher acts as a router for managing and switching between different attention implementations and provides a unified interface for interacting with them. + +Refer to the table below for an overview of the available attention families and to the [Available backends](#available-backends) section for a more complete list. + +| attention family | main feature | +|---|---| +| FlashAttention | minimizes memory reads/writes through tiling and recomputation | +| SageAttention | quantizes attention to int8 | +| PyTorch native | built-in PyTorch implementation using [scaled_dot_product_attention](./fp16#scaled-dot-product-attention) | +| xFormers | memory-efficient attention with support for various attention kernels | + +This guide will show you how to set and use the different attention backends. + +## set_attention_backend + +The [`~ModelMixin.set_attention_backend`] method iterates through all the modules in the model and sets the appropriate attention backend to use. The attention backend setting persists until [`~ModelMixin.reset_attention_backend`] is called. + +The example below demonstrates how to enable the `_flash_3_hub` implementation for FlashAttention-3 from the [kernel](https://github.com/huggingface/kernels) library, which allows you to instantly use optimized compute kernels from the Hub without requiring any setup. + +> [!TIP] +> FlashAttention-3 is not supported for non-Hopper architectures, in which case, use FlashAttention with `set_attention_backend("flash")`. + +```py +import torch +from diffusers import QwenImagePipeline + +pipeline = QwenImagePipeline.from_pretrained( + "Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cuda" +) +pipeline.transformer.set_attention_backend("_flash_3_hub") + +prompt = """ +cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California +highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain +""" +pipeline(prompt).images[0] +``` + +To restore the default attention backend, call [`~ModelMixin.reset_attention_backend`]. + +```py +pipeline.transformer.reset_attention_backend() +``` + +## attention_backend context manager + +The [attention_backend](https://github.com/huggingface/diffusers/blob/5e181eddfe7e44c1444a2511b0d8e21d177850a0/src/diffusers/models/attention_dispatch.py#L225) context manager temporarily sets an attention backend for a model within the context. Outside the context, the default attention (PyTorch's native scaled dot product attention) is used. This is useful if you want to use different backends for different parts of a pipeline or if you want to test the different backends. + +```py +import torch +from diffusers import QwenImagePipeline + +pipeline = QwenImagePipeline.from_pretrained( + "Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cuda" +) +prompt = """ +cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California +highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain +""" + +with attention_backend("_flash_3_hub"): + image = pipeline(prompt).images[0] +``` + +## Available backends + +Refer to the table below for a complete list of available attention backends and their variants. + +| Backend Name | Family | Description | +|--------------|--------|-------------| +| `native` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | Default backend using PyTorch's scaled_dot_product_attention | +| `flex` | [FlexAttention](https://docs.pytorch.org/docs/stable/nn.attention.flex_attention.html#module-torch.nn.attention.flex_attention) | PyTorch FlexAttention implementation | +| `_native_cudnn` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | CuDNN-optimized attention | +| `_native_efficient` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | Memory-efficient attention | +| `_native_flash` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | PyTorch's FlashAttention | +| `_native_math` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | Math-based attention (fallback) | +| `_native_npu` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | NPU-optimized attention | +| `_native_xla` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | XLA-optimized attention | +| `flash` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-2 | +| `flash_varlen` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | Variable length FlashAttention | +| `_flash_3` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-3 | +| `_flash_varlen_3` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | Variable length FlashAttention-3 | +| `_flash_3_hub` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-3 from kernels | +| `sage` | [SageAttention](https://github.com/thu-ml/SageAttention) | Quantized attention (INT8 QK) | +| `sage_varlen` | [SageAttention](https://github.com/thu-ml/SageAttention) | Variable length SageAttention | +| `_sage_qk_int8_pv_fp8_cuda` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP8 PV (CUDA) | +| `_sage_qk_int8_pv_fp8_cuda_sm90` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP8 PV (SM90) | +| `_sage_qk_int8_pv_fp16_cuda` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP16 PV (CUDA) | +| `_sage_qk_int8_pv_fp16_triton` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP16 PV (Triton) | +| `xformers` | [xFormers](https://github.com/facebookresearch/xformers) | Memory-efficient attention | From 09e777a3e13cf811e35da57abfe6ce239d9b0f15 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 24 Sep 2025 08:36:50 +0530 Subject: [PATCH 07/18] [tests] Single scheduler in lora tests (#12315) * single scheduler please. * up * up * up --- tests/lora/test_lora_layers_auraflow.py | 1 - tests/lora/test_lora_layers_cogvideox.py | 2 - tests/lora/test_lora_layers_cogview4.py | 36 +- tests/lora/test_lora_layers_flux.py | 6 +- tests/lora/test_lora_layers_hunyuanvideo.py | 1 - tests/lora/test_lora_layers_ltx_video.py | 1 - tests/lora/test_lora_layers_lumina2.py | 46 +- tests/lora/test_lora_layers_mochi.py | 1 - tests/lora/test_lora_layers_qwenimage.py | 1 - tests/lora/test_lora_layers_sana.py | 5 +- tests/lora/test_lora_layers_sd3.py | 1 - tests/lora/test_lora_layers_wan.py | 1 - tests/lora/test_lora_layers_wanvace.py | 4 +- tests/lora/utils.py | 2083 +++++++++---------- 14 files changed, 1045 insertions(+), 1144 deletions(-) diff --git a/tests/lora/test_lora_layers_auraflow.py b/tests/lora/test_lora_layers_auraflow.py index 67084dd6d0..91f63c4b56 100644 --- a/tests/lora/test_lora_layers_auraflow.py +++ b/tests/lora/test_lora_layers_auraflow.py @@ -43,7 +43,6 @@ from .utils import PeftLoraLoaderMixinTests # noqa: E402 class AuraFlowLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = AuraFlowPipeline scheduler_cls = FlowMatchEulerDiscreteScheduler - scheduler_classes = [FlowMatchEulerDiscreteScheduler] scheduler_kwargs = {} transformer_kwargs = { diff --git a/tests/lora/test_lora_layers_cogvideox.py b/tests/lora/test_lora_layers_cogvideox.py index 16147f35c7..fa57b4c9c2 100644 --- a/tests/lora/test_lora_layers_cogvideox.py +++ b/tests/lora/test_lora_layers_cogvideox.py @@ -21,7 +21,6 @@ from transformers import AutoTokenizer, T5EncoderModel from diffusers import ( AutoencoderKLCogVideoX, - CogVideoXDDIMScheduler, CogVideoXDPMScheduler, CogVideoXPipeline, CogVideoXTransformer3DModel, @@ -44,7 +43,6 @@ class CogVideoXLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = CogVideoXPipeline scheduler_cls = CogVideoXDPMScheduler scheduler_kwargs = {"timestep_spacing": "trailing"} - scheduler_classes = [CogVideoXDDIMScheduler, CogVideoXDPMScheduler] transformer_kwargs = { "num_attention_heads": 4, diff --git a/tests/lora/test_lora_layers_cogview4.py b/tests/lora/test_lora_layers_cogview4.py index 3b8a56c403..9c62d2f0b8 100644 --- a/tests/lora/test_lora_layers_cogview4.py +++ b/tests/lora/test_lora_layers_cogview4.py @@ -50,7 +50,6 @@ class TokenizerWrapper: class CogView4LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = CogView4Pipeline scheduler_cls = FlowMatchEulerDiscreteScheduler - scheduler_classes = [FlowMatchEulerDiscreteScheduler] scheduler_kwargs = {} transformer_kwargs = { @@ -124,30 +123,29 @@ class CogView4LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): """ Tests a simple usecase where users could use saving utilities for LoRA through save_pretrained """ - for scheduler_cls in self.scheduler_classes: - components, _, _ = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, _, _ = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) - images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) + with tempfile.TemporaryDirectory() as tmpdirname: + pipe.save_pretrained(tmpdirname) - pipe_from_pretrained = self.pipeline_class.from_pretrained(tmpdirname) - pipe_from_pretrained.to(torch_device) + pipe_from_pretrained = self.pipeline_class.from_pretrained(tmpdirname) + pipe_from_pretrained.to(torch_device) - images_lora_save_pretrained = pipe_from_pretrained(**inputs, generator=torch.manual_seed(0))[0] + images_lora_save_pretrained = pipe_from_pretrained(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue( - np.allclose(images_lora, images_lora_save_pretrained, atol=1e-3, rtol=1e-3), - "Loading from saved checkpoints should give same results.", - ) + self.assertTrue( + np.allclose(images_lora, images_lora_save_pretrained, atol=1e-3, rtol=1e-3), + "Loading from saved checkpoints should give same results.", + ) @parameterized.expand([("block_level", True), ("leaf_level", False)]) @require_torch_accelerator diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py index e6048f509f..6c22a34889 100644 --- a/tests/lora/test_lora_layers_flux.py +++ b/tests/lora/test_lora_layers_flux.py @@ -55,9 +55,8 @@ from .utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set # noqa @require_peft_backend class FluxLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = FluxPipeline - scheduler_cls = FlowMatchEulerDiscreteScheduler() + scheduler_cls = FlowMatchEulerDiscreteScheduler scheduler_kwargs = {} - scheduler_classes = [FlowMatchEulerDiscreteScheduler] transformer_kwargs = { "patch_size": 1, "in_channels": 4, @@ -282,9 +281,8 @@ class FluxLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): class FluxControlLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = FluxControlPipeline - scheduler_cls = FlowMatchEulerDiscreteScheduler() + scheduler_cls = FlowMatchEulerDiscreteScheduler scheduler_kwargs = {} - scheduler_classes = [FlowMatchEulerDiscreteScheduler] transformer_kwargs = { "patch_size": 1, "in_channels": 8, diff --git a/tests/lora/test_lora_layers_hunyuanvideo.py b/tests/lora/test_lora_layers_hunyuanvideo.py index 62d045f836..7ea0f1fcc9 100644 --- a/tests/lora/test_lora_layers_hunyuanvideo.py +++ b/tests/lora/test_lora_layers_hunyuanvideo.py @@ -51,7 +51,6 @@ from .utils import PeftLoraLoaderMixinTests # noqa: E402 class HunyuanVideoLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = HunyuanVideoPipeline scheduler_cls = FlowMatchEulerDiscreteScheduler - scheduler_classes = [FlowMatchEulerDiscreteScheduler] scheduler_kwargs = {} transformer_kwargs = { diff --git a/tests/lora/test_lora_layers_ltx_video.py b/tests/lora/test_lora_layers_ltx_video.py index a8ad30e448..6ab51a5e51 100644 --- a/tests/lora/test_lora_layers_ltx_video.py +++ b/tests/lora/test_lora_layers_ltx_video.py @@ -37,7 +37,6 @@ from .utils import PeftLoraLoaderMixinTests # noqa: E402 class LTXVideoLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = LTXPipeline scheduler_cls = FlowMatchEulerDiscreteScheduler - scheduler_classes = [FlowMatchEulerDiscreteScheduler] scheduler_kwargs = {} transformer_kwargs = { diff --git a/tests/lora/test_lora_layers_lumina2.py b/tests/lora/test_lora_layers_lumina2.py index 0ebc831b11..0417b05b33 100644 --- a/tests/lora/test_lora_layers_lumina2.py +++ b/tests/lora/test_lora_layers_lumina2.py @@ -39,7 +39,6 @@ from .utils import PeftLoraLoaderMixinTests, check_if_lora_correctly_set # noqa class Lumina2LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = Lumina2Pipeline scheduler_cls = FlowMatchEulerDiscreteScheduler - scheduler_classes = [FlowMatchEulerDiscreteScheduler] scheduler_kwargs = {} transformer_kwargs = { @@ -141,33 +140,30 @@ class Lumina2LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): strict=False, ) def test_lora_fuse_nan(self): - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder" - ) + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet - denoiser.add_adapter(denoiser_lora_config, "adapter-1") - self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet + denoiser.add_adapter(denoiser_lora_config, "adapter-1") + self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") - # corrupt one LoRA weight with `inf` values - with torch.no_grad(): - pipe.transformer.layers[0].attn.to_q.lora_A["adapter-1"].weight += float("inf") + # corrupt one LoRA weight with `inf` values + with torch.no_grad(): + pipe.transformer.layers[0].attn.to_q.lora_A["adapter-1"].weight += float("inf") - # with `safe_fusing=True` we should see an Error - with self.assertRaises(ValueError): - pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=True) + # with `safe_fusing=True` we should see an Error + with self.assertRaises(ValueError): + pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=True) - # without we should not see an error, but every image will be black - pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=False) - out = pipe(**inputs)[0] + # without we should not see an error, but every image will be black + pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=False) + out = pipe(**inputs)[0] - self.assertTrue(np.isnan(out).all()) + self.assertTrue(np.isnan(out).all()) diff --git a/tests/lora/test_lora_layers_mochi.py b/tests/lora/test_lora_layers_mochi.py index 21cc5f11a3..7be81273db 100644 --- a/tests/lora/test_lora_layers_mochi.py +++ b/tests/lora/test_lora_layers_mochi.py @@ -37,7 +37,6 @@ from .utils import PeftLoraLoaderMixinTests # noqa: E402 class MochiLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = MochiPipeline scheduler_cls = FlowMatchEulerDiscreteScheduler - scheduler_classes = [FlowMatchEulerDiscreteScheduler] scheduler_kwargs = {} transformer_kwargs = { diff --git a/tests/lora/test_lora_layers_qwenimage.py b/tests/lora/test_lora_layers_qwenimage.py index 44ef9b0a37..51de2f8e20 100644 --- a/tests/lora/test_lora_layers_qwenimage.py +++ b/tests/lora/test_lora_layers_qwenimage.py @@ -37,7 +37,6 @@ from .utils import PeftLoraLoaderMixinTests # noqa: E402 class QwenImageLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = QwenImagePipeline scheduler_cls = FlowMatchEulerDiscreteScheduler - scheduler_classes = [FlowMatchEulerDiscreteScheduler] scheduler_kwargs = {} transformer_kwargs = { diff --git a/tests/lora/test_lora_layers_sana.py b/tests/lora/test_lora_layers_sana.py index a08908c610..3cdb28de75 100644 --- a/tests/lora/test_lora_layers_sana.py +++ b/tests/lora/test_lora_layers_sana.py @@ -31,9 +31,8 @@ from .utils import PeftLoraLoaderMixinTests # noqa: E402 @require_peft_backend class SanaLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = SanaPipeline - scheduler_cls = FlowMatchEulerDiscreteScheduler(shift=7.0) - scheduler_kwargs = {} - scheduler_classes = [FlowMatchEulerDiscreteScheduler] + scheduler_cls = FlowMatchEulerDiscreteScheduler + scheduler_kwargs = {"shift": 7.0} transformer_kwargs = { "patch_size": 1, "in_channels": 4, diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py index 95f6f325e4..228460eaad 100644 --- a/tests/lora/test_lora_layers_sd3.py +++ b/tests/lora/test_lora_layers_sd3.py @@ -55,7 +55,6 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = StableDiffusion3Pipeline scheduler_cls = FlowMatchEulerDiscreteScheduler scheduler_kwargs = {} - scheduler_classes = [FlowMatchEulerDiscreteScheduler] transformer_kwargs = { "sample_size": 32, "patch_size": 1, diff --git a/tests/lora/test_lora_layers_wan.py b/tests/lora/test_lora_layers_wan.py index 0ba80d2be1..5734509b41 100644 --- a/tests/lora/test_lora_layers_wan.py +++ b/tests/lora/test_lora_layers_wan.py @@ -42,7 +42,6 @@ from .utils import PeftLoraLoaderMixinTests # noqa: E402 class WanLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = WanPipeline scheduler_cls = FlowMatchEulerDiscreteScheduler - scheduler_classes = [FlowMatchEulerDiscreteScheduler] scheduler_kwargs = {} transformer_kwargs = { diff --git a/tests/lora/test_lora_layers_wanvace.py b/tests/lora/test_lora_layers_wanvace.py index d8dde32dd8..c3244e150e 100644 --- a/tests/lora/test_lora_layers_wanvace.py +++ b/tests/lora/test_lora_layers_wanvace.py @@ -50,7 +50,6 @@ from .utils import PeftLoraLoaderMixinTests # noqa: E402 class WanVACELoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): pipeline_class = WanVACEPipeline scheduler_cls = FlowMatchEulerDiscreteScheduler - scheduler_classes = [FlowMatchEulerDiscreteScheduler] scheduler_kwargs = {} transformer_kwargs = { @@ -165,9 +164,8 @@ class WanVACELoRATests(unittest.TestCase, PeftLoraLoaderMixinTests): @require_peft_version_greater("0.13.2") def test_lora_exclude_modules_wanvace(self): - scheduler_cls = self.scheduler_classes[0] exclude_module_name = "vace_blocks.0.proj_out" - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() pipe = self.pipeline_class(**components).to(torch_device) _, _, inputs = self.get_dummy_inputs(with_generator=False) diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 72c1dddaa2..ecaa553ce4 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -26,8 +26,6 @@ from parameterized import parameterized from diffusers import ( AutoencoderKL, - DDIMScheduler, - LCMScheduler, UNet2DConditionModel, ) from diffusers.utils import logging @@ -109,7 +107,6 @@ class PeftLoraLoaderMixinTests: scheduler_cls = None scheduler_kwargs = None - scheduler_classes = [DDIMScheduler, LCMScheduler] has_two_text_encoders = False has_three_text_encoders = False @@ -129,13 +126,13 @@ class PeftLoraLoaderMixinTests: text_encoder_target_modules = ["q_proj", "k_proj", "v_proj", "out_proj"] denoiser_target_modules = ["to_q", "to_k", "to_v", "to_out.0"] - def get_dummy_components(self, scheduler_cls=None, use_dora=False, lora_alpha=None): + def get_dummy_components(self, use_dora=False, lora_alpha=None): if self.unet_kwargs and self.transformer_kwargs: raise ValueError("Both `unet_kwargs` and `transformer_kwargs` cannot be specified.") if self.has_two_text_encoders and self.has_three_text_encoders: raise ValueError("Both `has_two_text_encoders` and `has_three_text_encoders` cannot be True.") - scheduler_cls = self.scheduler_cls if scheduler_cls is None else scheduler_cls + scheduler_cls = self.scheduler_cls rank = 4 lora_alpha = rank if lora_alpha is None else lora_alpha @@ -319,152 +316,143 @@ class PeftLoraLoaderMixinTests: """ Tests a simple inference and makes sure it works as expected """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) + components, text_lora_config, _ = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs() - output_no_lora = pipe(**inputs)[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + _, _, inputs = self.get_dummy_inputs() + output_no_lora = pipe(**inputs)[0] + self.assertTrue(output_no_lora.shape == self.output_shape) def test_simple_inference_with_text_lora(self): """ Tests a simple inference with lora attached on the text encoder and makes sure it works as expected """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, _ = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) + pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) - output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue( - not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output" - ) + output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue( + not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output" + ) @require_peft_version_greater("0.13.1") def test_low_cpu_mem_usage_with_injection(self): """Tests if we can inject LoRA state dict with low_cpu_mem_usage.""" - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - inject_adapter_in_model(text_lora_config, pipe.text_encoder, low_cpu_mem_usage=True) + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + inject_adapter_in_model(text_lora_config, pipe.text_encoder, low_cpu_mem_usage=True) + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder.") + self.assertTrue( + "meta" in {p.device.type for p in pipe.text_encoder.parameters()}, + "The LoRA params should be on 'meta' device.", + ) + + te_state_dict = initialize_dummy_state_dict(get_peft_model_state_dict(pipe.text_encoder)) + set_peft_model_state_dict(pipe.text_encoder, te_state_dict, low_cpu_mem_usage=True) + self.assertTrue( + "meta" not in {p.device.type for p in pipe.text_encoder.parameters()}, + "No param should be on 'meta' device.", + ) + + denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet + inject_adapter_in_model(denoiser_lora_config, denoiser, low_cpu_mem_usage=True) + self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + self.assertTrue( + "meta" in {p.device.type for p in denoiser.parameters()}, "The LoRA params should be on 'meta' device." + ) + + denoiser_state_dict = initialize_dummy_state_dict(get_peft_model_state_dict(denoiser)) + set_peft_model_state_dict(denoiser, denoiser_state_dict, low_cpu_mem_usage=True) + self.assertTrue( + "meta" not in {p.device.type for p in denoiser.parameters()}, "No param should be on 'meta' device." + ) + + if self.has_two_text_encoders or self.has_three_text_encoders: + if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: + inject_adapter_in_model(text_lora_config, pipe.text_encoder_2, low_cpu_mem_usage=True) self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder." + check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" ) self.assertTrue( - "meta" in {p.device.type for p in pipe.text_encoder.parameters()}, + "meta" in {p.device.type for p in pipe.text_encoder_2.parameters()}, "The LoRA params should be on 'meta' device.", ) - te_state_dict = initialize_dummy_state_dict(get_peft_model_state_dict(pipe.text_encoder)) - set_peft_model_state_dict(pipe.text_encoder, te_state_dict, low_cpu_mem_usage=True) + te2_state_dict = initialize_dummy_state_dict(get_peft_model_state_dict(pipe.text_encoder_2)) + set_peft_model_state_dict(pipe.text_encoder_2, te2_state_dict, low_cpu_mem_usage=True) self.assertTrue( - "meta" not in {p.device.type for p in pipe.text_encoder.parameters()}, + "meta" not in {p.device.type for p in pipe.text_encoder_2.parameters()}, "No param should be on 'meta' device.", ) - denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet - inject_adapter_in_model(denoiser_lora_config, denoiser, low_cpu_mem_usage=True) - self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") - self.assertTrue( - "meta" in {p.device.type for p in denoiser.parameters()}, "The LoRA params should be on 'meta' device." - ) - - denoiser_state_dict = initialize_dummy_state_dict(get_peft_model_state_dict(denoiser)) - set_peft_model_state_dict(denoiser, denoiser_state_dict, low_cpu_mem_usage=True) - self.assertTrue( - "meta" not in {p.device.type for p in denoiser.parameters()}, "No param should be on 'meta' device." - ) - - if self.has_two_text_encoders or self.has_three_text_encoders: - if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: - inject_adapter_in_model(text_lora_config, pipe.text_encoder_2, low_cpu_mem_usage=True) - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" - ) - self.assertTrue( - "meta" in {p.device.type for p in pipe.text_encoder_2.parameters()}, - "The LoRA params should be on 'meta' device.", - ) - - te2_state_dict = initialize_dummy_state_dict(get_peft_model_state_dict(pipe.text_encoder_2)) - set_peft_model_state_dict(pipe.text_encoder_2, te2_state_dict, low_cpu_mem_usage=True) - self.assertTrue( - "meta" not in {p.device.type for p in pipe.text_encoder_2.parameters()}, - "No param should be on 'meta' device.", - ) - - _, _, inputs = self.get_dummy_inputs() - output_lora = pipe(**inputs)[0] - self.assertTrue(output_lora.shape == self.output_shape) + _, _, inputs = self.get_dummy_inputs() + output_lora = pipe(**inputs)[0] + self.assertTrue(output_lora.shape == self.output_shape) @require_peft_version_greater("0.13.1") @require_transformers_version_greater("4.45.2") def test_low_cpu_mem_usage_with_loading(self): """Tests if we can load LoRA state dict with low_cpu_mem_usage.""" + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) + images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + with tempfile.TemporaryDirectory() as tmpdirname: + modules_to_save = self._get_modules_to_save(pipe, has_denoiser=True) + lora_state_dicts = self._get_lora_state_dicts(modules_to_save) + self.pipeline_class.save_lora_weights( + save_directory=tmpdirname, safe_serialization=False, **lora_state_dicts + ) - with tempfile.TemporaryDirectory() as tmpdirname: - modules_to_save = self._get_modules_to_save(pipe, has_denoiser=True) - lora_state_dicts = self._get_lora_state_dicts(modules_to_save) - self.pipeline_class.save_lora_weights( - save_directory=tmpdirname, safe_serialization=False, **lora_state_dicts - ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) + pipe.unload_lora_weights() + pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.bin"), low_cpu_mem_usage=False) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) - pipe.unload_lora_weights() - pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.bin"), low_cpu_mem_usage=False) + for module_name, module in modules_to_save.items(): + self.assertTrue(check_if_lora_correctly_set(module), f"Lora not correctly set in {module_name}") - for module_name, module in modules_to_save.items(): - self.assertTrue(check_if_lora_correctly_set(module), f"Lora not correctly set in {module_name}") + images_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue( + np.allclose(images_lora, images_lora_from_pretrained, atol=1e-3, rtol=1e-3), + "Loading from saved checkpoints should give same results.", + ) - images_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue( - np.allclose(images_lora, images_lora_from_pretrained, atol=1e-3, rtol=1e-3), - "Loading from saved checkpoints should give same results.", - ) + # Now, check for `low_cpu_mem_usage.` + pipe.unload_lora_weights() + pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.bin"), low_cpu_mem_usage=True) - # Now, check for `low_cpu_mem_usage.` - pipe.unload_lora_weights() - pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.bin"), low_cpu_mem_usage=True) + for module_name, module in modules_to_save.items(): + self.assertTrue(check_if_lora_correctly_set(module), f"Lora not correctly set in {module_name}") - for module_name, module in modules_to_save.items(): - self.assertTrue(check_if_lora_correctly_set(module), f"Lora not correctly set in {module_name}") - - images_lora_from_pretrained_low_cpu = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue( - np.allclose( - images_lora_from_pretrained_low_cpu, images_lora_from_pretrained, atol=1e-3, rtol=1e-3 - ), - "Loading from saved checkpoints with `low_cpu_mem_usage` should give same results.", - ) + images_lora_from_pretrained_low_cpu = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue( + np.allclose(images_lora_from_pretrained_low_cpu, images_lora_from_pretrained, atol=1e-3, rtol=1e-3), + "Loading from saved checkpoints with `low_cpu_mem_usage` should give same results.", + ) def test_simple_inference_with_text_lora_and_scale(self): """ @@ -472,411 +460,393 @@ class PeftLoraLoaderMixinTests: and makes sure it works as expected """ attention_kwargs_name = determine_attention_kwargs_name(self.pipeline_class) + components, text_lora_config, _ = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) + output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue( + not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output" + ) - output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue( - not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output" - ) + attention_kwargs = {attention_kwargs_name: {"scale": 0.5}} + output_lora_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] - attention_kwargs = {attention_kwargs_name: {"scale": 0.5}} - output_lora_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] + self.assertTrue( + not np.allclose(output_lora, output_lora_scale, atol=1e-3, rtol=1e-3), + "Lora + scale should change the output", + ) - self.assertTrue( - not np.allclose(output_lora, output_lora_scale, atol=1e-3, rtol=1e-3), - "Lora + scale should change the output", - ) + attention_kwargs = {attention_kwargs_name: {"scale": 0.0}} + output_lora_0_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] - attention_kwargs = {attention_kwargs_name: {"scale": 0.0}} - output_lora_0_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] - - self.assertTrue( - np.allclose(output_no_lora, output_lora_0_scale, atol=1e-3, rtol=1e-3), - "Lora + 0 scale should lead to same result as no LoRA", - ) + self.assertTrue( + np.allclose(output_no_lora, output_lora_0_scale, atol=1e-3, rtol=1e-3), + "Lora + 0 scale should lead to same result as no LoRA", + ) def test_simple_inference_with_text_lora_fused(self): """ Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model and makes sure it works as expected """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, _ = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) + pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) - pipe.fuse_lora() - # Fusing should still keep the LoRA layers - self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") + pipe.fuse_lora() + # Fusing should still keep the LoRA layers + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - if self.has_two_text_encoders or self.has_three_text_encoders: - if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" - ) + if self.has_two_text_encoders or self.has_three_text_encoders: + if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: + self.assertTrue( + check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" + ) - ouput_fused = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertFalse( - np.allclose(ouput_fused, output_no_lora, atol=1e-3, rtol=1e-3), "Fused lora should change the output" - ) + ouput_fused = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertFalse( + np.allclose(ouput_fused, output_no_lora, atol=1e-3, rtol=1e-3), "Fused lora should change the output" + ) def test_simple_inference_with_text_lora_unloaded(self): """ Tests a simple inference with lora attached to text encoder, then unloads the lora weights and makes sure it works as expected """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, _ = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) + pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) - pipe.unload_lora_weights() - # unloading should remove the LoRA layers - self.assertFalse( - check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly unloaded in text encoder" - ) + pipe.unload_lora_weights() + # unloading should remove the LoRA layers + self.assertFalse(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly unloaded in text encoder") - if self.has_two_text_encoders or self.has_three_text_encoders: - if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: - self.assertFalse( - check_if_lora_correctly_set(pipe.text_encoder_2), - "Lora not correctly unloaded in text encoder 2", - ) + if self.has_two_text_encoders or self.has_three_text_encoders: + if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: + self.assertFalse( + check_if_lora_correctly_set(pipe.text_encoder_2), + "Lora not correctly unloaded in text encoder 2", + ) - ouput_unloaded = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue( - np.allclose(ouput_unloaded, output_no_lora, atol=1e-3, rtol=1e-3), - "Fused lora should change the output", - ) + ouput_unloaded = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue( + np.allclose(ouput_unloaded, output_no_lora, atol=1e-3, rtol=1e-3), + "Fused lora should change the output", + ) def test_simple_inference_with_text_lora_save_load(self): """ Tests a simple usecase where users could use saving utilities for LoRA. """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, _ = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) + pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) - images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - with tempfile.TemporaryDirectory() as tmpdirname: - modules_to_save = self._get_modules_to_save(pipe) - lora_state_dicts = self._get_lora_state_dicts(modules_to_save) + with tempfile.TemporaryDirectory() as tmpdirname: + modules_to_save = self._get_modules_to_save(pipe) + lora_state_dicts = self._get_lora_state_dicts(modules_to_save) - self.pipeline_class.save_lora_weights( - save_directory=tmpdirname, safe_serialization=False, **lora_state_dicts - ) - - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) - pipe.unload_lora_weights() - pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.bin")) - - for module_name, module in modules_to_save.items(): - self.assertTrue(check_if_lora_correctly_set(module), f"Lora not correctly set in {module_name}") - - images_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0))[0] - - self.assertTrue( - np.allclose(images_lora, images_lora_from_pretrained, atol=1e-3, rtol=1e-3), - "Loading from saved checkpoints should give same results.", + self.pipeline_class.save_lora_weights( + save_directory=tmpdirname, safe_serialization=False, **lora_state_dicts ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) + pipe.unload_lora_weights() + pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.bin")) + + for module_name, module in modules_to_save.items(): + self.assertTrue(check_if_lora_correctly_set(module), f"Lora not correctly set in {module_name}") + + images_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0))[0] + + self.assertTrue( + np.allclose(images_lora, images_lora_from_pretrained, atol=1e-3, rtol=1e-3), + "Loading from saved checkpoints should give same results.", + ) + def test_simple_inference_with_partial_text_lora(self): """ Tests a simple inference with lora attached on the text encoder with different ranks and some adapters removed and makes sure it works as expected """ - for scheduler_cls in self.scheduler_classes: - components, _, _ = self.get_dummy_components(scheduler_cls) - # Verify `StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder` handles different ranks per module (PR#8324). - text_lora_config = LoraConfig( - r=4, - rank_pattern={self.text_encoder_target_modules[i]: i + 1 for i in range(3)}, - lora_alpha=4, - target_modules=self.text_encoder_target_modules, - init_lora_weights=False, - use_dora=False, - ) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, _, _ = self.get_dummy_components() + # Verify `StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder` handles different ranks per module (PR#8324). + text_lora_config = LoraConfig( + r=4, + rank_pattern={self.text_encoder_target_modules[i]: i + 1 for i in range(3)}, + lora_alpha=4, + target_modules=self.text_encoder_target_modules, + init_lora_weights=False, + use_dora=False, + ) + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) + pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) - state_dict = {} - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - # Gather the state dict for the PEFT model, excluding `layers.4`, to ensure `load_lora_into_text_encoder` - # supports missing layers (PR#8324). - state_dict = { - f"text_encoder.{module_name}": param - for module_name, param in get_peft_model_state_dict(pipe.text_encoder).items() - if "text_model.encoder.layers.4" not in module_name - } + state_dict = {} + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + # Gather the state dict for the PEFT model, excluding `layers.4`, to ensure `load_lora_into_text_encoder` + # supports missing layers (PR#8324). + state_dict = { + f"text_encoder.{module_name}": param + for module_name, param in get_peft_model_state_dict(pipe.text_encoder).items() + if "text_model.encoder.layers.4" not in module_name + } - if self.has_two_text_encoders or self.has_three_text_encoders: - if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: - state_dict.update( - { - f"text_encoder_2.{module_name}": param - for module_name, param in get_peft_model_state_dict(pipe.text_encoder_2).items() - if "text_model.encoder.layers.4" not in module_name - } - ) + if self.has_two_text_encoders or self.has_three_text_encoders: + if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: + state_dict.update( + { + f"text_encoder_2.{module_name}": param + for module_name, param in get_peft_model_state_dict(pipe.text_encoder_2).items() + if "text_model.encoder.layers.4" not in module_name + } + ) - output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue( - not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output" - ) + output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue( + not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output" + ) - # Unload lora and load it back using the pipe.load_lora_weights machinery - pipe.unload_lora_weights() - pipe.load_lora_weights(state_dict) + # Unload lora and load it back using the pipe.load_lora_weights machinery + pipe.unload_lora_weights() + pipe.load_lora_weights(state_dict) - output_partial_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue( - not np.allclose(output_partial_lora, output_lora, atol=1e-3, rtol=1e-3), - "Removing adapters should change the output", - ) + output_partial_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue( + not np.allclose(output_partial_lora, output_lora, atol=1e-3, rtol=1e-3), + "Removing adapters should change the output", + ) def test_simple_inference_save_pretrained_with_text_lora(self): """ Tests a simple usecase where users could use saving utilities for LoRA through save_pretrained """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, _ = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, _ = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) - images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config=None) + images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) + with tempfile.TemporaryDirectory() as tmpdirname: + pipe.save_pretrained(tmpdirname) - pipe_from_pretrained = self.pipeline_class.from_pretrained(tmpdirname) - pipe_from_pretrained.to(torch_device) + pipe_from_pretrained = self.pipeline_class.from_pretrained(tmpdirname) + pipe_from_pretrained.to(torch_device) - if "text_encoder" in self.pipeline_class._lora_loadable_modules: + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + self.assertTrue( + check_if_lora_correctly_set(pipe_from_pretrained.text_encoder), + "Lora not correctly set in text encoder", + ) + + if self.has_two_text_encoders or self.has_three_text_encoders: + if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: self.assertTrue( - check_if_lora_correctly_set(pipe_from_pretrained.text_encoder), - "Lora not correctly set in text encoder", + check_if_lora_correctly_set(pipe_from_pretrained.text_encoder_2), + "Lora not correctly set in text encoder 2", ) - if self.has_two_text_encoders or self.has_three_text_encoders: - if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: - self.assertTrue( - check_if_lora_correctly_set(pipe_from_pretrained.text_encoder_2), - "Lora not correctly set in text encoder 2", - ) + images_lora_save_pretrained = pipe_from_pretrained(**inputs, generator=torch.manual_seed(0))[0] - images_lora_save_pretrained = pipe_from_pretrained(**inputs, generator=torch.manual_seed(0))[0] - - self.assertTrue( - np.allclose(images_lora, images_lora_save_pretrained, atol=1e-3, rtol=1e-3), - "Loading from saved checkpoints should give same results.", - ) + self.assertTrue( + np.allclose(images_lora, images_lora_save_pretrained, atol=1e-3, rtol=1e-3), + "Loading from saved checkpoints should give same results.", + ) def test_simple_inference_with_text_denoiser_lora_save_load(self): """ Tests a simple usecase where users could use saving utilities for LoRA for Unet + text encoder """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) + pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) - images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - with tempfile.TemporaryDirectory() as tmpdirname: - modules_to_save = self._get_modules_to_save(pipe, has_denoiser=True) - lora_state_dicts = self._get_lora_state_dicts(modules_to_save) - self.pipeline_class.save_lora_weights( - save_directory=tmpdirname, safe_serialization=False, **lora_state_dicts - ) - - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) - pipe.unload_lora_weights() - pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.bin")) - - for module_name, module in modules_to_save.items(): - self.assertTrue(check_if_lora_correctly_set(module), f"Lora not correctly set in {module_name}") - - images_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue( - np.allclose(images_lora, images_lora_from_pretrained, atol=1e-3, rtol=1e-3), - "Loading from saved checkpoints should give same results.", + with tempfile.TemporaryDirectory() as tmpdirname: + modules_to_save = self._get_modules_to_save(pipe, has_denoiser=True) + lora_state_dicts = self._get_lora_state_dicts(modules_to_save) + self.pipeline_class.save_lora_weights( + save_directory=tmpdirname, safe_serialization=False, **lora_state_dicts ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) + pipe.unload_lora_weights() + pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.bin")) + + for module_name, module in modules_to_save.items(): + self.assertTrue(check_if_lora_correctly_set(module), f"Lora not correctly set in {module_name}") + + images_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue( + np.allclose(images_lora, images_lora_from_pretrained, atol=1e-3, rtol=1e-3), + "Loading from saved checkpoints should give same results.", + ) + def test_simple_inference_with_text_denoiser_lora_and_scale(self): """ Tests a simple inference with lora attached on the text encoder + Unet + scale argument and makes sure it works as expected """ attention_kwargs_name = determine_attention_kwargs_name(self.pipeline_class) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) + output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue( + not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output" + ) - output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + attention_kwargs = {attention_kwargs_name: {"scale": 0.5}} + output_lora_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] + + self.assertTrue( + not np.allclose(output_lora, output_lora_scale, atol=1e-3, rtol=1e-3), + "Lora + scale should change the output", + ) + + attention_kwargs = {attention_kwargs_name: {"scale": 0.0}} + output_lora_0_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] + + self.assertTrue( + np.allclose(output_no_lora, output_lora_0_scale, atol=1e-3, rtol=1e-3), + "Lora + 0 scale should lead to same result as no LoRA", + ) + + if "text_encoder" in self.pipeline_class._lora_loadable_modules: self.assertTrue( - not np.allclose(output_lora, output_no_lora, atol=1e-3, rtol=1e-3), "Lora should change the output" + pipe.text_encoder.text_model.encoder.layers[0].self_attn.q_proj.scaling["default"] == 1.0, + "The scaling parameter has not been correctly restored!", ) - attention_kwargs = {attention_kwargs_name: {"scale": 0.5}} - output_lora_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] - - self.assertTrue( - not np.allclose(output_lora, output_lora_scale, atol=1e-3, rtol=1e-3), - "Lora + scale should change the output", - ) - - attention_kwargs = {attention_kwargs_name: {"scale": 0.0}} - output_lora_0_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] - - self.assertTrue( - np.allclose(output_no_lora, output_lora_0_scale, atol=1e-3, rtol=1e-3), - "Lora + 0 scale should lead to same result as no LoRA", - ) - - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - self.assertTrue( - pipe.text_encoder.text_model.encoder.layers[0].self_attn.q_proj.scaling["default"] == 1.0, - "The scaling parameter has not been correctly restored!", - ) - def test_simple_inference_with_text_lora_denoiser_fused(self): """ Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model and makes sure it works as expected - with unet """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, denoiser = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) + pipe, denoiser = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) - pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules) + pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules) - # Fusing should still keep the LoRA layers - if "text_encoder" in self.pipeline_class._lora_loadable_modules: + # Fusing should still keep the LoRA layers + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") + + self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser") + + if self.has_two_text_encoders or self.has_three_text_encoders: + if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder" + check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" ) - self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser") - - if self.has_two_text_encoders or self.has_three_text_encoders: - if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" - ) - - output_fused = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertFalse( - np.allclose(output_fused, output_no_lora, atol=1e-3, rtol=1e-3), "Fused lora should change the output" - ) + output_fused = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertFalse( + np.allclose(output_fused, output_no_lora, atol=1e-3, rtol=1e-3), "Fused lora should change the output" + ) def test_simple_inference_with_text_denoiser_lora_unloaded(self): """ Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights and makes sure it works as expected """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) - pipe, denoiser = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) + pipe, denoiser = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) - pipe.unload_lora_weights() - # unloading should remove the LoRA layers - self.assertFalse( - check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly unloaded in text encoder" - ) - self.assertFalse(check_if_lora_correctly_set(denoiser), "Lora not correctly unloaded in denoiser") + pipe.unload_lora_weights() + # unloading should remove the LoRA layers + self.assertFalse(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly unloaded in text encoder") + self.assertFalse(check_if_lora_correctly_set(denoiser), "Lora not correctly unloaded in denoiser") - if self.has_two_text_encoders or self.has_three_text_encoders: - if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: - self.assertFalse( - check_if_lora_correctly_set(pipe.text_encoder_2), - "Lora not correctly unloaded in text encoder 2", - ) + if self.has_two_text_encoders or self.has_three_text_encoders: + if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: + self.assertFalse( + check_if_lora_correctly_set(pipe.text_encoder_2), + "Lora not correctly unloaded in text encoder 2", + ) - output_unloaded = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue( - np.allclose(output_unloaded, output_no_lora, atol=1e-3, rtol=1e-3), - "Fused lora should change the output", - ) + output_unloaded = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue( + np.allclose(output_unloaded, output_no_lora, atol=1e-3, rtol=1e-3), + "Fused lora should change the output", + ) def test_simple_inference_with_text_denoiser_lora_unfused( self, expected_atol: float = 1e-3, expected_rtol: float = 1e-3 @@ -885,125 +855,120 @@ class PeftLoraLoaderMixinTests: Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights and makes sure it works as expected """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - pipe, denoiser = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) + pipe, denoiser = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) - pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules) - self.assertTrue(pipe.num_fused_loras == 1, f"{pipe.num_fused_loras=}, {pipe.fused_loras=}") - output_fused_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules) + self.assertTrue(pipe.num_fused_loras == 1, f"{pipe.num_fused_loras=}, {pipe.fused_loras=}") + output_fused_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - pipe.unfuse_lora(components=self.pipeline_class._lora_loadable_modules) - self.assertTrue(pipe.num_fused_loras == 0, f"{pipe.num_fused_loras=}, {pipe.fused_loras=}") - output_unfused_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + pipe.unfuse_lora(components=self.pipeline_class._lora_loadable_modules) + self.assertTrue(pipe.num_fused_loras == 0, f"{pipe.num_fused_loras=}, {pipe.fused_loras=}") + output_unfused_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - # unloading should remove the LoRA layers - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Unfuse should still keep LoRA layers") + # unloading should remove the LoRA layers + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Unfuse should still keep LoRA layers") - self.assertTrue(check_if_lora_correctly_set(denoiser), "Unfuse should still keep LoRA layers") + self.assertTrue(check_if_lora_correctly_set(denoiser), "Unfuse should still keep LoRA layers") - if self.has_two_text_encoders or self.has_three_text_encoders: - if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder_2), "Unfuse should still keep LoRA layers" - ) + if self.has_two_text_encoders or self.has_three_text_encoders: + if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: + self.assertTrue( + check_if_lora_correctly_set(pipe.text_encoder_2), "Unfuse should still keep LoRA layers" + ) - # Fuse and unfuse should lead to the same results - self.assertTrue( - np.allclose(output_fused_lora, output_unfused_lora, atol=expected_atol, rtol=expected_rtol), - "Fused lora should not change the output", - ) + # Fuse and unfuse should lead to the same results + self.assertTrue( + np.allclose(output_fused_lora, output_unfused_lora, atol=expected_atol, rtol=expected_rtol), + "Fused lora should not change the output", + ) def test_simple_inference_with_text_denoiser_multi_adapter(self): """ Tests a simple inference with lora attached to text encoder and unet, attaches multiple adapters and set them """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") + pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") + + denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet + denoiser.add_adapter(denoiser_lora_config, "adapter-1") + denoiser.add_adapter(denoiser_lora_config, "adapter-2") + self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + + if self.has_two_text_encoders or self.has_three_text_encoders: + if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") + pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder" + check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" ) - denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet - denoiser.add_adapter(denoiser_lora_config, "adapter-1") - denoiser.add_adapter(denoiser_lora_config, "adapter-2") - self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + pipe.set_adapters("adapter-1") + output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertFalse( + np.allclose(output_no_lora, output_adapter_1, atol=1e-3, rtol=1e-3), + "Adapter outputs should be different.", + ) - if self.has_two_text_encoders or self.has_three_text_encoders: - if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") - pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" - ) + pipe.set_adapters("adapter-2") + output_adapter_2 = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertFalse( + np.allclose(output_no_lora, output_adapter_2, atol=1e-3, rtol=1e-3), + "Adapter outputs should be different.", + ) - pipe.set_adapters("adapter-1") - output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertFalse( - np.allclose(output_no_lora, output_adapter_1, atol=1e-3, rtol=1e-3), - "Adapter outputs should be different.", - ) + pipe.set_adapters(["adapter-1", "adapter-2"]) + output_adapter_mixed = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertFalse( + np.allclose(output_no_lora, output_adapter_mixed, atol=1e-3, rtol=1e-3), + "Adapter outputs should be different.", + ) - pipe.set_adapters("adapter-2") - output_adapter_2 = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertFalse( - np.allclose(output_no_lora, output_adapter_2, atol=1e-3, rtol=1e-3), - "Adapter outputs should be different.", - ) + # Fuse and unfuse should lead to the same results + self.assertFalse( + np.allclose(output_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3), + "Adapter 1 and 2 should give different results", + ) - pipe.set_adapters(["adapter-1", "adapter-2"]) - output_adapter_mixed = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertFalse( - np.allclose(output_no_lora, output_adapter_mixed, atol=1e-3, rtol=1e-3), - "Adapter outputs should be different.", - ) + self.assertFalse( + np.allclose(output_adapter_1, output_adapter_mixed, atol=1e-3, rtol=1e-3), + "Adapter 1 and mixed adapters should give different results", + ) - # Fuse and unfuse should lead to the same results - self.assertFalse( - np.allclose(output_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3), - "Adapter 1 and 2 should give different results", - ) + self.assertFalse( + np.allclose(output_adapter_2, output_adapter_mixed, atol=1e-3, rtol=1e-3), + "Adapter 2 and mixed adapters should give different results", + ) - self.assertFalse( - np.allclose(output_adapter_1, output_adapter_mixed, atol=1e-3, rtol=1e-3), - "Adapter 1 and mixed adapters should give different results", - ) + pipe.disable_lora() + output_disabled = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertFalse( - np.allclose(output_adapter_2, output_adapter_mixed, atol=1e-3, rtol=1e-3), - "Adapter 2 and mixed adapters should give different results", - ) - - pipe.disable_lora() - output_disabled = pipe(**inputs, generator=torch.manual_seed(0))[0] - - self.assertTrue( - np.allclose(output_no_lora, output_disabled, atol=1e-3, rtol=1e-3), - "output with no lora and output with lora disabled should give same results", - ) + self.assertTrue( + np.allclose(output_no_lora, output_disabled, atol=1e-3, rtol=1e-3), + "output with no lora and output with lora disabled should give same results", + ) def test_wrong_adapter_name_raises_error(self): adapter_name = "adapter-1" - scheduler_cls = self.scheduler_classes[0] - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -1024,8 +989,7 @@ class PeftLoraLoaderMixinTests: def test_multiple_wrong_adapter_name_raises_error(self): adapter_name = "adapter-1" - scheduler_cls = self.scheduler_classes[0] - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -1054,131 +1018,127 @@ class PeftLoraLoaderMixinTests: Tests a simple inference with lora attached to text encoder and unet, attaches one adapter and set different weights for different blocks (i.e. block lora) """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") + pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") - denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet - denoiser.add_adapter(denoiser_lora_config) - self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet + denoiser.add_adapter(denoiser_lora_config) + self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") - if self.has_two_text_encoders or self.has_three_text_encoders: - if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" - ) + if self.has_two_text_encoders or self.has_three_text_encoders: + if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") + self.assertTrue( + check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" + ) - weights_1 = {"text_encoder": 2, "unet": {"down": 5}} - pipe.set_adapters("adapter-1", weights_1) - output_weights_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] + weights_1 = {"text_encoder": 2, "unet": {"down": 5}} + pipe.set_adapters("adapter-1", weights_1) + output_weights_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] - weights_2 = {"unet": {"up": 5}} - pipe.set_adapters("adapter-1", weights_2) - output_weights_2 = pipe(**inputs, generator=torch.manual_seed(0))[0] + weights_2 = {"unet": {"up": 5}} + pipe.set_adapters("adapter-1", weights_2) + output_weights_2 = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertFalse( - np.allclose(output_weights_1, output_weights_2, atol=1e-3, rtol=1e-3), - "LoRA weights 1 and 2 should give different results", - ) - self.assertFalse( - np.allclose(output_no_lora, output_weights_1, atol=1e-3, rtol=1e-3), - "No adapter and LoRA weights 1 should give different results", - ) - self.assertFalse( - np.allclose(output_no_lora, output_weights_2, atol=1e-3, rtol=1e-3), - "No adapter and LoRA weights 2 should give different results", - ) + self.assertFalse( + np.allclose(output_weights_1, output_weights_2, atol=1e-3, rtol=1e-3), + "LoRA weights 1 and 2 should give different results", + ) + self.assertFalse( + np.allclose(output_no_lora, output_weights_1, atol=1e-3, rtol=1e-3), + "No adapter and LoRA weights 1 should give different results", + ) + self.assertFalse( + np.allclose(output_no_lora, output_weights_2, atol=1e-3, rtol=1e-3), + "No adapter and LoRA weights 2 should give different results", + ) - pipe.disable_lora() - output_disabled = pipe(**inputs, generator=torch.manual_seed(0))[0] + pipe.disable_lora() + output_disabled = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue( - np.allclose(output_no_lora, output_disabled, atol=1e-3, rtol=1e-3), - "output with no lora and output with lora disabled should give same results", - ) + self.assertTrue( + np.allclose(output_no_lora, output_disabled, atol=1e-3, rtol=1e-3), + "output with no lora and output with lora disabled should give same results", + ) def test_simple_inference_with_text_denoiser_multi_adapter_block_lora(self): """ Tests a simple inference with lora attached to text encoder and unet, attaches multiple adapters and set different weights for different blocks (i.e. block lora) """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") + pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") + + denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet + denoiser.add_adapter(denoiser_lora_config, "adapter-1") + denoiser.add_adapter(denoiser_lora_config, "adapter-2") + self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + + if self.has_two_text_encoders or self.has_three_text_encoders: + if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") + pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder" + check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" ) - denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet - denoiser.add_adapter(denoiser_lora_config, "adapter-1") - denoiser.add_adapter(denoiser_lora_config, "adapter-2") - self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + scales_1 = {"text_encoder": 2, "unet": {"down": 5}} + scales_2 = {"unet": {"down": 5, "mid": 5}} - if self.has_two_text_encoders or self.has_three_text_encoders: - if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") - pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" - ) + pipe.set_adapters("adapter-1", scales_1) + output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] - scales_1 = {"text_encoder": 2, "unet": {"down": 5}} - scales_2 = {"unet": {"down": 5, "mid": 5}} + pipe.set_adapters("adapter-2", scales_2) + output_adapter_2 = pipe(**inputs, generator=torch.manual_seed(0))[0] - pipe.set_adapters("adapter-1", scales_1) - output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] + pipe.set_adapters(["adapter-1", "adapter-2"], [scales_1, scales_2]) + output_adapter_mixed = pipe(**inputs, generator=torch.manual_seed(0))[0] - pipe.set_adapters("adapter-2", scales_2) - output_adapter_2 = pipe(**inputs, generator=torch.manual_seed(0))[0] + # Fuse and unfuse should lead to the same results + self.assertFalse( + np.allclose(output_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3), + "Adapter 1 and 2 should give different results", + ) - pipe.set_adapters(["adapter-1", "adapter-2"], [scales_1, scales_2]) - output_adapter_mixed = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertFalse( + np.allclose(output_adapter_1, output_adapter_mixed, atol=1e-3, rtol=1e-3), + "Adapter 1 and mixed adapters should give different results", + ) - # Fuse and unfuse should lead to the same results - self.assertFalse( - np.allclose(output_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3), - "Adapter 1 and 2 should give different results", - ) + self.assertFalse( + np.allclose(output_adapter_2, output_adapter_mixed, atol=1e-3, rtol=1e-3), + "Adapter 2 and mixed adapters should give different results", + ) - self.assertFalse( - np.allclose(output_adapter_1, output_adapter_mixed, atol=1e-3, rtol=1e-3), - "Adapter 1 and mixed adapters should give different results", - ) + pipe.disable_lora() + output_disabled = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertFalse( - np.allclose(output_adapter_2, output_adapter_mixed, atol=1e-3, rtol=1e-3), - "Adapter 2 and mixed adapters should give different results", - ) + self.assertTrue( + np.allclose(output_no_lora, output_disabled, atol=1e-3, rtol=1e-3), + "output with no lora and output with lora disabled should give same results", + ) - pipe.disable_lora() - output_disabled = pipe(**inputs, generator=torch.manual_seed(0))[0] - - self.assertTrue( - np.allclose(output_no_lora, output_disabled, atol=1e-3, rtol=1e-3), - "output with no lora and output with lora disabled should give same results", - ) - - # a mismatching number of adapter_names and adapter_weights should raise an error - with self.assertRaises(ValueError): - pipe.set_adapters(["adapter-1", "adapter-2"], [scales_1]) + # a mismatching number of adapter_names and adapter_weights should raise an error + with self.assertRaises(ValueError): + pipe.set_adapters(["adapter-1", "adapter-2"], [scales_1]) def test_simple_inference_with_text_denoiser_block_scale_for_all_dict_options(self): """Tests that any valid combination of lora block scales can be used in pipe.set_adapter""" @@ -1274,170 +1234,164 @@ class PeftLoraLoaderMixinTests: Tests a simple inference with lora attached to text encoder and unet, attaches multiple adapters and set/delete them """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") + pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") + + denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet + denoiser.add_adapter(denoiser_lora_config, "adapter-1") + denoiser.add_adapter(denoiser_lora_config, "adapter-2") + self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + + if self.has_two_text_encoders or self.has_three_text_encoders: + lora_loadable_components = self.pipeline_class._lora_loadable_modules + if "text_encoder_2" in lora_loadable_components: + pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") + pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder" + check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" ) - denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet - denoiser.add_adapter(denoiser_lora_config, "adapter-1") - denoiser.add_adapter(denoiser_lora_config, "adapter-2") - self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + pipe.set_adapters("adapter-1") + output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] - if self.has_two_text_encoders or self.has_three_text_encoders: - lora_loadable_components = self.pipeline_class._lora_loadable_modules - if "text_encoder_2" in lora_loadable_components: - pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") - pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" - ) + pipe.set_adapters("adapter-2") + output_adapter_2 = pipe(**inputs, generator=torch.manual_seed(0))[0] - pipe.set_adapters("adapter-1") - output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] + pipe.set_adapters(["adapter-1", "adapter-2"]) + output_adapter_mixed = pipe(**inputs, generator=torch.manual_seed(0))[0] - pipe.set_adapters("adapter-2") - output_adapter_2 = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertFalse( + np.allclose(output_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3), + "Adapter 1 and 2 should give different results", + ) - pipe.set_adapters(["adapter-1", "adapter-2"]) - output_adapter_mixed = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertFalse( + np.allclose(output_adapter_1, output_adapter_mixed, atol=1e-3, rtol=1e-3), + "Adapter 1 and mixed adapters should give different results", + ) - self.assertFalse( - np.allclose(output_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3), - "Adapter 1 and 2 should give different results", - ) + self.assertFalse( + np.allclose(output_adapter_2, output_adapter_mixed, atol=1e-3, rtol=1e-3), + "Adapter 2 and mixed adapters should give different results", + ) - self.assertFalse( - np.allclose(output_adapter_1, output_adapter_mixed, atol=1e-3, rtol=1e-3), - "Adapter 1 and mixed adapters should give different results", - ) + pipe.delete_adapters("adapter-1") + output_deleted_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertFalse( - np.allclose(output_adapter_2, output_adapter_mixed, atol=1e-3, rtol=1e-3), - "Adapter 2 and mixed adapters should give different results", - ) + self.assertTrue( + np.allclose(output_deleted_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3), + "Adapter 1 and 2 should give different results", + ) - pipe.delete_adapters("adapter-1") - output_deleted_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] + pipe.delete_adapters("adapter-2") + output_deleted_adapters = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue( - np.allclose(output_deleted_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3), - "Adapter 1 and 2 should give different results", - ) + self.assertTrue( + np.allclose(output_no_lora, output_deleted_adapters, atol=1e-3, rtol=1e-3), + "output with no lora and output with lora disabled should give same results", + ) - pipe.delete_adapters("adapter-2") - output_deleted_adapters = pipe(**inputs, generator=torch.manual_seed(0))[0] + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") + pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") - self.assertTrue( - np.allclose(output_no_lora, output_deleted_adapters, atol=1e-3, rtol=1e-3), - "output with no lora and output with lora disabled should give same results", - ) + denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet + denoiser.add_adapter(denoiser_lora_config, "adapter-1") + denoiser.add_adapter(denoiser_lora_config, "adapter-2") + self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") + pipe.set_adapters(["adapter-1", "adapter-2"]) + pipe.delete_adapters(["adapter-1", "adapter-2"]) - denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet - denoiser.add_adapter(denoiser_lora_config, "adapter-1") - denoiser.add_adapter(denoiser_lora_config, "adapter-2") - self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + output_deleted_adapters = pipe(**inputs, generator=torch.manual_seed(0))[0] - pipe.set_adapters(["adapter-1", "adapter-2"]) - pipe.delete_adapters(["adapter-1", "adapter-2"]) - - output_deleted_adapters = pipe(**inputs, generator=torch.manual_seed(0))[0] - - self.assertTrue( - np.allclose(output_no_lora, output_deleted_adapters, atol=1e-3, rtol=1e-3), - "output with no lora and output with lora disabled should give same results", - ) + self.assertTrue( + np.allclose(output_no_lora, output_deleted_adapters, atol=1e-3, rtol=1e-3), + "output with no lora and output with lora disabled should give same results", + ) def test_simple_inference_with_text_denoiser_multi_adapter_weighted(self): """ Tests a simple inference with lora attached to text encoder and unet, attaches multiple adapters and set them """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") + pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") + + denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet + denoiser.add_adapter(denoiser_lora_config, "adapter-1") + denoiser.add_adapter(denoiser_lora_config, "adapter-2") + self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + + if self.has_two_text_encoders or self.has_three_text_encoders: + lora_loadable_components = self.pipeline_class._lora_loadable_modules + if "text_encoder_2" in lora_loadable_components: + pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") + pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder" + check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" ) - denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet - denoiser.add_adapter(denoiser_lora_config, "adapter-1") - denoiser.add_adapter(denoiser_lora_config, "adapter-2") - self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + pipe.set_adapters("adapter-1") + output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] - if self.has_two_text_encoders or self.has_three_text_encoders: - lora_loadable_components = self.pipeline_class._lora_loadable_modules - if "text_encoder_2" in lora_loadable_components: - pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") - pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" - ) + pipe.set_adapters("adapter-2") + output_adapter_2 = pipe(**inputs, generator=torch.manual_seed(0))[0] - pipe.set_adapters("adapter-1") - output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] + pipe.set_adapters(["adapter-1", "adapter-2"]) + output_adapter_mixed = pipe(**inputs, generator=torch.manual_seed(0))[0] - pipe.set_adapters("adapter-2") - output_adapter_2 = pipe(**inputs, generator=torch.manual_seed(0))[0] + # Fuse and unfuse should lead to the same results + self.assertFalse( + np.allclose(output_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3), + "Adapter 1 and 2 should give different results", + ) - pipe.set_adapters(["adapter-1", "adapter-2"]) - output_adapter_mixed = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertFalse( + np.allclose(output_adapter_1, output_adapter_mixed, atol=1e-3, rtol=1e-3), + "Adapter 1 and mixed adapters should give different results", + ) - # Fuse and unfuse should lead to the same results - self.assertFalse( - np.allclose(output_adapter_1, output_adapter_2, atol=1e-3, rtol=1e-3), - "Adapter 1 and 2 should give different results", - ) + self.assertFalse( + np.allclose(output_adapter_2, output_adapter_mixed, atol=1e-3, rtol=1e-3), + "Adapter 2 and mixed adapters should give different results", + ) - self.assertFalse( - np.allclose(output_adapter_1, output_adapter_mixed, atol=1e-3, rtol=1e-3), - "Adapter 1 and mixed adapters should give different results", - ) + pipe.set_adapters(["adapter-1", "adapter-2"], [0.5, 0.6]) + output_adapter_mixed_weighted = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertFalse( - np.allclose(output_adapter_2, output_adapter_mixed, atol=1e-3, rtol=1e-3), - "Adapter 2 and mixed adapters should give different results", - ) + self.assertFalse( + np.allclose(output_adapter_mixed_weighted, output_adapter_mixed, atol=1e-3, rtol=1e-3), + "Weighted adapter and mixed adapter should give different results", + ) - pipe.set_adapters(["adapter-1", "adapter-2"], [0.5, 0.6]) - output_adapter_mixed_weighted = pipe(**inputs, generator=torch.manual_seed(0))[0] + pipe.disable_lora() + output_disabled = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertFalse( - np.allclose(output_adapter_mixed_weighted, output_adapter_mixed, atol=1e-3, rtol=1e-3), - "Weighted adapter and mixed adapter should give different results", - ) - - pipe.disable_lora() - output_disabled = pipe(**inputs, generator=torch.manual_seed(0))[0] - - self.assertTrue( - np.allclose(output_no_lora, output_disabled, atol=1e-3, rtol=1e-3), - "output with no lora and output with lora disabled should give same results", - ) + self.assertTrue( + np.allclose(output_no_lora, output_disabled, atol=1e-3, rtol=1e-3), + "output with no lora and output with lora disabled should give same results", + ) @skip_mps @pytest.mark.xfail( @@ -1446,163 +1400,156 @@ class PeftLoraLoaderMixinTests: strict=False, ) def test_lora_fuse_nan(self): - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder" + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") + + denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet + denoiser.add_adapter(denoiser_lora_config, "adapter-1") + self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + + # corrupt one LoRA weight with `inf` values + with torch.no_grad(): + if self.unet_kwargs: + pipe.unet.mid_block.attentions[0].transformer_blocks[0].attn1.to_q.lora_A["adapter-1"].weight += float( + "inf" ) + else: + named_modules = [name for name, _ in pipe.transformer.named_modules()] + possible_tower_names = [ + "transformer_blocks", + "blocks", + "joint_transformer_blocks", + "single_transformer_blocks", + ] + filtered_tower_names = [ + tower_name for tower_name in possible_tower_names if hasattr(pipe.transformer, tower_name) + ] + if len(filtered_tower_names) == 0: + reason = f"`pipe.transformer` didn't have any of the following attributes: {possible_tower_names}." + raise ValueError(reason) + for tower_name in filtered_tower_names: + transformer_tower = getattr(pipe.transformer, tower_name) + has_attn1 = any("attn1" in name for name in named_modules) + if has_attn1: + transformer_tower[0].attn1.to_q.lora_A["adapter-1"].weight += float("inf") + else: + transformer_tower[0].attn.to_q.lora_A["adapter-1"].weight += float("inf") - denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet - denoiser.add_adapter(denoiser_lora_config, "adapter-1") - self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + # with `safe_fusing=True` we should see an Error + with self.assertRaises(ValueError): + pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=True) - # corrupt one LoRA weight with `inf` values - with torch.no_grad(): - if self.unet_kwargs: - pipe.unet.mid_block.attentions[0].transformer_blocks[0].attn1.to_q.lora_A[ - "adapter-1" - ].weight += float("inf") - else: - named_modules = [name for name, _ in pipe.transformer.named_modules()] - possible_tower_names = [ - "transformer_blocks", - "blocks", - "joint_transformer_blocks", - "single_transformer_blocks", - ] - filtered_tower_names = [ - tower_name for tower_name in possible_tower_names if hasattr(pipe.transformer, tower_name) - ] - if len(filtered_tower_names) == 0: - reason = ( - f"`pipe.transformer` didn't have any of the following attributes: {possible_tower_names}." - ) - raise ValueError(reason) - for tower_name in filtered_tower_names: - transformer_tower = getattr(pipe.transformer, tower_name) - has_attn1 = any("attn1" in name for name in named_modules) - if has_attn1: - transformer_tower[0].attn1.to_q.lora_A["adapter-1"].weight += float("inf") - else: - transformer_tower[0].attn.to_q.lora_A["adapter-1"].weight += float("inf") + # without we should not see an error, but every image will be black + pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=False) + out = pipe(**inputs)[0] - # with `safe_fusing=True` we should see an Error - with self.assertRaises(ValueError): - pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=True) - - # without we should not see an error, but every image will be black - pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, safe_fusing=False) - out = pipe(**inputs)[0] - - self.assertTrue(np.isnan(out).all()) + self.assertTrue(np.isnan(out).all()) def test_get_adapters(self): """ Tests a simple usecase where we attach multiple adapters and check if the results are the expected results """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") + pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet - denoiser.add_adapter(denoiser_lora_config, "adapter-1") + denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet + denoiser.add_adapter(denoiser_lora_config, "adapter-1") - adapter_names = pipe.get_active_adapters() - self.assertListEqual(adapter_names, ["adapter-1"]) + adapter_names = pipe.get_active_adapters() + self.assertListEqual(adapter_names, ["adapter-1"]) - pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") - denoiser.add_adapter(denoiser_lora_config, "adapter-2") + pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") + denoiser.add_adapter(denoiser_lora_config, "adapter-2") - adapter_names = pipe.get_active_adapters() - self.assertListEqual(adapter_names, ["adapter-2"]) + adapter_names = pipe.get_active_adapters() + self.assertListEqual(adapter_names, ["adapter-2"]) - pipe.set_adapters(["adapter-1", "adapter-2"]) - self.assertListEqual(pipe.get_active_adapters(), ["adapter-1", "adapter-2"]) + pipe.set_adapters(["adapter-1", "adapter-2"]) + self.assertListEqual(pipe.get_active_adapters(), ["adapter-1", "adapter-2"]) def test_get_list_adapters(self): """ Tests a simple usecase where we attach multiple adapters and check if the results are the expected results """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) - # 1. - dicts_to_be_checked = {} - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - dicts_to_be_checked = {"text_encoder": ["adapter-1"]} + # 1. + dicts_to_be_checked = {} + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") + dicts_to_be_checked = {"text_encoder": ["adapter-1"]} - if self.unet_kwargs is not None: - pipe.unet.add_adapter(denoiser_lora_config, "adapter-1") - dicts_to_be_checked.update({"unet": ["adapter-1"]}) - else: - pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1") - dicts_to_be_checked.update({"transformer": ["adapter-1"]}) + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-1") + dicts_to_be_checked.update({"unet": ["adapter-1"]}) + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-1") + dicts_to_be_checked.update({"transformer": ["adapter-1"]}) - self.assertDictEqual(pipe.get_list_adapters(), dicts_to_be_checked) + self.assertDictEqual(pipe.get_list_adapters(), dicts_to_be_checked) - # 2. - dicts_to_be_checked = {} - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") - dicts_to_be_checked = {"text_encoder": ["adapter-1", "adapter-2"]} + # 2. + dicts_to_be_checked = {} + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") + dicts_to_be_checked = {"text_encoder": ["adapter-1", "adapter-2"]} - if self.unet_kwargs is not None: - pipe.unet.add_adapter(denoiser_lora_config, "adapter-2") - dicts_to_be_checked.update({"unet": ["adapter-1", "adapter-2"]}) - else: - pipe.transformer.add_adapter(denoiser_lora_config, "adapter-2") - dicts_to_be_checked.update({"transformer": ["adapter-1", "adapter-2"]}) + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-2") + dicts_to_be_checked.update({"unet": ["adapter-1", "adapter-2"]}) + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-2") + dicts_to_be_checked.update({"transformer": ["adapter-1", "adapter-2"]}) - self.assertDictEqual(pipe.get_list_adapters(), dicts_to_be_checked) + self.assertDictEqual(pipe.get_list_adapters(), dicts_to_be_checked) - # 3. - pipe.set_adapters(["adapter-1", "adapter-2"]) + # 3. + pipe.set_adapters(["adapter-1", "adapter-2"]) - dicts_to_be_checked = {} - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - dicts_to_be_checked = {"text_encoder": ["adapter-1", "adapter-2"]} + dicts_to_be_checked = {} + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + dicts_to_be_checked = {"text_encoder": ["adapter-1", "adapter-2"]} - if self.unet_kwargs is not None: - dicts_to_be_checked.update({"unet": ["adapter-1", "adapter-2"]}) - else: - dicts_to_be_checked.update({"transformer": ["adapter-1", "adapter-2"]}) + if self.unet_kwargs is not None: + dicts_to_be_checked.update({"unet": ["adapter-1", "adapter-2"]}) + else: + dicts_to_be_checked.update({"transformer": ["adapter-1", "adapter-2"]}) - self.assertDictEqual( - pipe.get_list_adapters(), - dicts_to_be_checked, - ) + self.assertDictEqual( + pipe.get_list_adapters(), + dicts_to_be_checked, + ) - # 4. - dicts_to_be_checked = {} - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - dicts_to_be_checked = {"text_encoder": ["adapter-1", "adapter-2"]} + # 4. + dicts_to_be_checked = {} + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + dicts_to_be_checked = {"text_encoder": ["adapter-1", "adapter-2"]} - if self.unet_kwargs is not None: - pipe.unet.add_adapter(denoiser_lora_config, "adapter-3") - dicts_to_be_checked.update({"unet": ["adapter-1", "adapter-2", "adapter-3"]}) - else: - pipe.transformer.add_adapter(denoiser_lora_config, "adapter-3") - dicts_to_be_checked.update({"transformer": ["adapter-1", "adapter-2", "adapter-3"]}) + if self.unet_kwargs is not None: + pipe.unet.add_adapter(denoiser_lora_config, "adapter-3") + dicts_to_be_checked.update({"unet": ["adapter-1", "adapter-2", "adapter-3"]}) + else: + pipe.transformer.add_adapter(denoiser_lora_config, "adapter-3") + dicts_to_be_checked.update({"transformer": ["adapter-1", "adapter-2", "adapter-3"]}) - self.assertDictEqual(pipe.get_list_adapters(), dicts_to_be_checked) + self.assertDictEqual(pipe.get_list_adapters(), dicts_to_be_checked) @require_peft_version_greater(peft_version="0.6.2") def test_simple_inference_with_text_lora_denoiser_fused_multi( @@ -1612,8 +1559,83 @@ class PeftLoraLoaderMixinTests: Tests a simple inference with lora attached into text encoder + fuses the lora weights into base model and makes sure it works as expected - with unet and multi-adapter case """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) + + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) + + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") + pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") + + denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet + denoiser.add_adapter(denoiser_lora_config, "adapter-1") + self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + denoiser.add_adapter(denoiser_lora_config, "adapter-2") + + if self.has_two_text_encoders or self.has_three_text_encoders: + lora_loadable_components = self.pipeline_class._lora_loadable_modules + if "text_encoder_2" in lora_loadable_components: + pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") + self.assertTrue( + check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" + ) + pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") + + # set them to multi-adapter inference mode + pipe.set_adapters(["adapter-1", "adapter-2"]) + outputs_all_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + + pipe.set_adapters(["adapter-1"]) + outputs_lora_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] + + pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, adapter_names=["adapter-1"]) + self.assertTrue(pipe.num_fused_loras == 1, f"{pipe.num_fused_loras=}, {pipe.fused_loras=}") + + # Fusing should still keep the LoRA layers so output should remain the same + outputs_lora_1_fused = pipe(**inputs, generator=torch.manual_seed(0))[0] + + self.assertTrue( + np.allclose(outputs_lora_1, outputs_lora_1_fused, atol=expected_atol, rtol=expected_rtol), + "Fused lora should not change the output", + ) + + pipe.unfuse_lora(components=self.pipeline_class._lora_loadable_modules) + self.assertTrue(pipe.num_fused_loras == 0, f"{pipe.num_fused_loras=}, {pipe.fused_loras=}") + + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Unfuse should still keep LoRA layers") + + self.assertTrue(check_if_lora_correctly_set(denoiser), "Unfuse should still keep LoRA layers") + + if self.has_two_text_encoders or self.has_three_text_encoders: + if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: + self.assertTrue( + check_if_lora_correctly_set(pipe.text_encoder_2), "Unfuse should still keep LoRA layers" + ) + + pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, adapter_names=["adapter-2", "adapter-1"]) + self.assertTrue(pipe.num_fused_loras == 2, f"{pipe.num_fused_loras=}, {pipe.fused_loras=}") + + # Fusing should still keep the LoRA layers + output_all_lora_fused = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue( + np.allclose(output_all_lora_fused, outputs_all_lora, atol=expected_atol, rtol=expected_rtol), + "Fused lora should not change the output", + ) + pipe.unfuse_lora(components=self.pipeline_class._lora_loadable_modules) + self.assertTrue(pipe.num_fused_loras == 0, f"{pipe.num_fused_loras=}, {pipe.fused_loras=}") + + def test_lora_scale_kwargs_match_fusion(self, expected_atol: float = 1e-3, expected_rtol: float = 1e-3): + attention_kwargs_name = determine_attention_kwargs_name(self.pipeline_class) + + for lora_scale in [1.0, 0.8]: + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -1627,150 +1649,65 @@ class PeftLoraLoaderMixinTests: self.assertTrue( check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder" ) - pipe.text_encoder.add_adapter(text_lora_config, "adapter-2") denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet denoiser.add_adapter(denoiser_lora_config, "adapter-1") self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") - denoiser.add_adapter(denoiser_lora_config, "adapter-2") if self.has_two_text_encoders or self.has_three_text_encoders: lora_loadable_components = self.pipeline_class._lora_loadable_modules if "text_encoder_2" in lora_loadable_components: pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" + check_if_lora_correctly_set(pipe.text_encoder_2), + "Lora not correctly set in text encoder 2", ) - pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-2") - - # set them to multi-adapter inference mode - pipe.set_adapters(["adapter-1", "adapter-2"]) - outputs_all_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] pipe.set_adapters(["adapter-1"]) - outputs_lora_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] + attention_kwargs = {attention_kwargs_name: {"scale": lora_scale}} + outputs_lora_1 = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] - pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules, adapter_names=["adapter-1"]) + pipe.fuse_lora( + components=self.pipeline_class._lora_loadable_modules, + adapter_names=["adapter-1"], + lora_scale=lora_scale, + ) self.assertTrue(pipe.num_fused_loras == 1, f"{pipe.num_fused_loras=}, {pipe.fused_loras=}") - # Fusing should still keep the LoRA layers so output should remain the same outputs_lora_1_fused = pipe(**inputs, generator=torch.manual_seed(0))[0] self.assertTrue( np.allclose(outputs_lora_1, outputs_lora_1_fused, atol=expected_atol, rtol=expected_rtol), "Fused lora should not change the output", ) - - pipe.unfuse_lora(components=self.pipeline_class._lora_loadable_modules) - self.assertTrue(pipe.num_fused_loras == 0, f"{pipe.num_fused_loras=}, {pipe.fused_loras=}") - - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Unfuse should still keep LoRA layers") - - self.assertTrue(check_if_lora_correctly_set(denoiser), "Unfuse should still keep LoRA layers") - - if self.has_two_text_encoders or self.has_three_text_encoders: - if "text_encoder_2" in self.pipeline_class._lora_loadable_modules: - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder_2), "Unfuse should still keep LoRA layers" - ) - - pipe.fuse_lora( - components=self.pipeline_class._lora_loadable_modules, adapter_names=["adapter-2", "adapter-1"] + self.assertFalse( + np.allclose(output_no_lora, outputs_lora_1, atol=expected_atol, rtol=expected_rtol), + "LoRA should change the output", ) - self.assertTrue(pipe.num_fused_loras == 2, f"{pipe.num_fused_loras=}, {pipe.fused_loras=}") - - # Fusing should still keep the LoRA layers - output_all_lora_fused = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue( - np.allclose(output_all_lora_fused, outputs_all_lora, atol=expected_atol, rtol=expected_rtol), - "Fused lora should not change the output", - ) - pipe.unfuse_lora(components=self.pipeline_class._lora_loadable_modules) - self.assertTrue(pipe.num_fused_loras == 0, f"{pipe.num_fused_loras=}, {pipe.fused_loras=}") - - def test_lora_scale_kwargs_match_fusion(self, expected_atol: float = 1e-3, expected_rtol: float = 1e-3): - attention_kwargs_name = determine_attention_kwargs_name(self.pipeline_class) - - for lora_scale in [1.0, 0.8]: - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) - - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) - - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder.add_adapter(text_lora_config, "adapter-1") - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder" - ) - - denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet - denoiser.add_adapter(denoiser_lora_config, "adapter-1") - self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") - - if self.has_two_text_encoders or self.has_three_text_encoders: - lora_loadable_components = self.pipeline_class._lora_loadable_modules - if "text_encoder_2" in lora_loadable_components: - pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1") - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder_2), - "Lora not correctly set in text encoder 2", - ) - - pipe.set_adapters(["adapter-1"]) - attention_kwargs = {attention_kwargs_name: {"scale": lora_scale}} - outputs_lora_1 = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] - - pipe.fuse_lora( - components=self.pipeline_class._lora_loadable_modules, - adapter_names=["adapter-1"], - lora_scale=lora_scale, - ) - self.assertTrue(pipe.num_fused_loras == 1, f"{pipe.num_fused_loras=}, {pipe.fused_loras=}") - - outputs_lora_1_fused = pipe(**inputs, generator=torch.manual_seed(0))[0] - - self.assertTrue( - np.allclose(outputs_lora_1, outputs_lora_1_fused, atol=expected_atol, rtol=expected_rtol), - "Fused lora should not change the output", - ) - self.assertFalse( - np.allclose(output_no_lora, outputs_lora_1, atol=expected_atol, rtol=expected_rtol), - "LoRA should change the output", - ) @require_peft_version_greater(peft_version="0.9.0") def test_simple_inference_with_dora(self): - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components( - scheduler_cls, use_dora=True - ) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(use_dora=True) + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_dora_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_dora_lora.shape == self.output_shape) + output_no_dora_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_dora_lora.shape == self.output_shape) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) + pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) - output_dora_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_dora_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertFalse( - np.allclose(output_dora_lora, output_no_dora_lora, atol=1e-3, rtol=1e-3), - "DoRA lora should change the output", - ) + self.assertFalse( + np.allclose(output_dora_lora, output_no_dora_lora, atol=1e-3, rtol=1e-3), + "DoRA lora should change the output", + ) def test_missing_keys_warning(self): - scheduler_cls = self.scheduler_classes[0] # Skip text encoder check for now as that is handled with `transformers`. - components, _, denoiser_lora_config = self.get_dummy_components(scheduler_cls) + components, _, denoiser_lora_config = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -1805,9 +1742,8 @@ class PeftLoraLoaderMixinTests: self.assertTrue(missing_key.replace(f"{component}.", "") in cap_logger.out.replace("default_0.", "")) def test_unexpected_keys_warning(self): - scheduler_cls = self.scheduler_classes[0] # Skip text encoder check for now as that is handled with `transformers`. - components, _, denoiser_lora_config = self.get_dummy_components(scheduler_cls) + components, _, denoiser_lora_config = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -1842,23 +1778,22 @@ class PeftLoraLoaderMixinTests: Tests a simple inference with lora attached to text encoder and unet, then unloads the lora weights and makes sure it works as expected """ - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) + pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - pipe.text_encoder = torch.compile(pipe.text_encoder, mode="reduce-overhead", fullgraph=True) + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + pipe.text_encoder = torch.compile(pipe.text_encoder, mode="reduce-overhead", fullgraph=True) - if self.has_two_text_encoders or self.has_three_text_encoders: - pipe.text_encoder_2 = torch.compile(pipe.text_encoder_2, mode="reduce-overhead", fullgraph=True) + if self.has_two_text_encoders or self.has_three_text_encoders: + pipe.text_encoder_2 = torch.compile(pipe.text_encoder_2, mode="reduce-overhead", fullgraph=True) - # Just makes sure it works.. - _ = pipe(**inputs, generator=torch.manual_seed(0))[0] + # Just makes sure it works. + _ = pipe(**inputs, generator=torch.manual_seed(0))[0] def test_modify_padding_mode(self): def set_pad_mode(network, mode="circular"): @@ -1866,22 +1801,20 @@ class PeftLoraLoaderMixinTests: if isinstance(module, torch.nn.Conv2d): module.padding_mode = mode - for scheduler_cls in self.scheduler_classes: - components, _, _ = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _pad_mode = "circular" - set_pad_mode(pipe.vae, _pad_mode) - set_pad_mode(pipe.unet, _pad_mode) + components, _, _ = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _pad_mode = "circular" + set_pad_mode(pipe.vae, _pad_mode) + set_pad_mode(pipe.unet, _pad_mode) - _, _, inputs = self.get_dummy_inputs() - _ = pipe(**inputs)[0] + _, _, inputs = self.get_dummy_inputs() + _ = pipe(**inputs)[0] def test_logs_info_when_no_lora_keys_found(self): - scheduler_cls = self.scheduler_classes[0] # Skip text encoder check for now as that is handled with `transformers`. - components, _, _ = self.get_dummy_components(scheduler_cls) + components, _, _ = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -1925,73 +1858,71 @@ class PeftLoraLoaderMixinTests: def test_set_adapters_match_attention_kwargs(self): """Test to check if outputs after `set_adapters()` and attention kwargs match.""" attention_kwargs_name = determine_attention_kwargs_name(self.pipeline_class) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(output_no_lora.shape == self.output_shape) + + pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) + + lora_scale = 0.5 + attention_kwargs = {attention_kwargs_name: {"scale": lora_scale}} + output_lora_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] + self.assertFalse( + np.allclose(output_no_lora, output_lora_scale, atol=1e-3, rtol=1e-3), + "Lora + scale should change the output", + ) + + pipe.set_adapters("default", lora_scale) + output_lora_scale_wo_kwargs = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue( + not np.allclose(output_no_lora, output_lora_scale_wo_kwargs, atol=1e-3, rtol=1e-3), + "Lora + scale should change the output", + ) + self.assertTrue( + np.allclose(output_lora_scale, output_lora_scale_wo_kwargs, atol=1e-3, rtol=1e-3), + "Lora + scale should match the output of `set_adapters()`.", + ) + + with tempfile.TemporaryDirectory() as tmpdirname: + modules_to_save = self._get_modules_to_save(pipe, has_denoiser=True) + lora_state_dicts = self._get_lora_state_dicts(modules_to_save) + self.pipeline_class.save_lora_weights( + save_directory=tmpdirname, safe_serialization=True, **lora_state_dicts + ) + + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(output_no_lora.shape == self.output_shape) + for module_name, module in modules_to_save.items(): + self.assertTrue(check_if_lora_correctly_set(module), f"Lora not correctly set in {module_name}") - pipe, _ = self.add_adapters_to_pipeline(pipe, text_lora_config, denoiser_lora_config) - - lora_scale = 0.5 - attention_kwargs = {attention_kwargs_name: {"scale": lora_scale}} - output_lora_scale = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] - self.assertFalse( - np.allclose(output_no_lora, output_lora_scale, atol=1e-3, rtol=1e-3), - "Lora + scale should change the output", - ) - - pipe.set_adapters("default", lora_scale) - output_lora_scale_wo_kwargs = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] self.assertTrue( - not np.allclose(output_no_lora, output_lora_scale_wo_kwargs, atol=1e-3, rtol=1e-3), + not np.allclose(output_no_lora, output_lora_from_pretrained, atol=1e-3, rtol=1e-3), "Lora + scale should change the output", ) self.assertTrue( - np.allclose(output_lora_scale, output_lora_scale_wo_kwargs, atol=1e-3, rtol=1e-3), - "Lora + scale should match the output of `set_adapters()`.", + np.allclose(output_lora_scale, output_lora_from_pretrained, atol=1e-3, rtol=1e-3), + "Loading from saved checkpoints should give same results as attention_kwargs.", + ) + self.assertTrue( + np.allclose(output_lora_scale_wo_kwargs, output_lora_from_pretrained, atol=1e-3, rtol=1e-3), + "Loading from saved checkpoints should give same results as set_adapters().", ) - - with tempfile.TemporaryDirectory() as tmpdirname: - modules_to_save = self._get_modules_to_save(pipe, has_denoiser=True) - lora_state_dicts = self._get_lora_state_dicts(modules_to_save) - self.pipeline_class.save_lora_weights( - save_directory=tmpdirname, safe_serialization=True, **lora_state_dicts - ) - - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.load_lora_weights(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")) - - for module_name, module in modules_to_save.items(): - self.assertTrue(check_if_lora_correctly_set(module), f"Lora not correctly set in {module_name}") - - output_lora_from_pretrained = pipe(**inputs, generator=torch.manual_seed(0), **attention_kwargs)[0] - self.assertTrue( - not np.allclose(output_no_lora, output_lora_from_pretrained, atol=1e-3, rtol=1e-3), - "Lora + scale should change the output", - ) - self.assertTrue( - np.allclose(output_lora_scale, output_lora_from_pretrained, atol=1e-3, rtol=1e-3), - "Loading from saved checkpoints should give same results as attention_kwargs.", - ) - self.assertTrue( - np.allclose(output_lora_scale_wo_kwargs, output_lora_from_pretrained, atol=1e-3, rtol=1e-3), - "Loading from saved checkpoints should give same results as set_adapters().", - ) @require_peft_version_greater("0.13.2") def test_lora_B_bias(self): # Currently, this test is only relevant for Flux Control LoRA as we are not # aware of any other LoRA checkpoint that has its `lora_B` biases trained. - components, _, denoiser_lora_config = self.get_dummy_components(self.scheduler_classes[0]) + components, _, denoiser_lora_config = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -2028,7 +1959,7 @@ class PeftLoraLoaderMixinTests: self.assertFalse(np.allclose(lora_bias_false_output, lora_bias_true_output, atol=1e-3, rtol=1e-3)) def test_correct_lora_configs_with_different_ranks(self): - components, _, denoiser_lora_config = self.get_dummy_components(self.scheduler_classes[0]) + components, _, denoiser_lora_config = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -2114,7 +2045,7 @@ class PeftLoraLoaderMixinTests: self.assertEqual(submodule.bias.dtype, dtype_to_check) def initialize_pipeline(storage_dtype=None, compute_dtype=torch.float32): - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(self.scheduler_classes[0]) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device, dtype=compute_dtype) pipe.set_progress_bar_config(disable=None) @@ -2181,7 +2112,7 @@ class PeftLoraLoaderMixinTests: self.assertTrue(module._diffusers_hook.get_hook(_PEFT_AUTOCAST_DISABLE_HOOK) is not None) # 1. Test forward with add_adapter - components, _, denoiser_lora_config = self.get_dummy_components(self.scheduler_classes[0]) + components, _, denoiser_lora_config = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device, dtype=compute_dtype) pipe.set_progress_bar_config(disable=None) @@ -2211,7 +2142,7 @@ class PeftLoraLoaderMixinTests: ) self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) - components, _, _ = self.get_dummy_components(self.scheduler_classes[0]) + components, _, _ = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device, dtype=compute_dtype) pipe.set_progress_bar_config(disable=None) @@ -2231,10 +2162,7 @@ class PeftLoraLoaderMixinTests: @parameterized.expand([4, 8, 16]) def test_lora_adapter_metadata_is_loaded_correctly(self, lora_alpha): - scheduler_cls = self.scheduler_classes[0] - components, text_lora_config, denoiser_lora_config = self.get_dummy_components( - scheduler_cls, lora_alpha=lora_alpha - ) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(lora_alpha=lora_alpha) pipe = self.pipeline_class(**components) pipe, _ = self.add_adapters_to_pipeline( @@ -2280,10 +2208,7 @@ class PeftLoraLoaderMixinTests: @parameterized.expand([4, 8, 16]) def test_lora_adapter_metadata_save_load_inference(self, lora_alpha): - scheduler_cls = self.scheduler_classes[0] - components, text_lora_config, denoiser_lora_config = self.get_dummy_components( - scheduler_cls, lora_alpha=lora_alpha - ) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components(lora_alpha=lora_alpha) pipe = self.pipeline_class(**components).to(torch_device) _, _, inputs = self.get_dummy_inputs(with_generator=False) @@ -2311,8 +2236,7 @@ class PeftLoraLoaderMixinTests: def test_lora_unload_add_adapter(self): """Tests if `unload_lora_weights()` -> `add_adapter()` works.""" - scheduler_cls = self.scheduler_classes[0] - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() pipe = self.pipeline_class(**components).to(torch_device) _, _, inputs = self.get_dummy_inputs(with_generator=False) @@ -2330,51 +2254,48 @@ class PeftLoraLoaderMixinTests: def test_inference_load_delete_load_adapters(self): "Tests if `load_lora_weights()` -> `delete_adapters()` -> `load_lora_weights()` works." - for scheduler_cls in self.scheduler_classes: - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls) - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - _, _, inputs = self.get_dummy_inputs(with_generator=False) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + _, _, inputs = self.get_dummy_inputs(with_generator=False) - output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] + output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0] - if "text_encoder" in self.pipeline_class._lora_loadable_modules: - pipe.text_encoder.add_adapter(text_lora_config) + if "text_encoder" in self.pipeline_class._lora_loadable_modules: + pipe.text_encoder.add_adapter(text_lora_config) + self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder") + + denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet + denoiser.add_adapter(denoiser_lora_config) + self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + + if self.has_two_text_encoders or self.has_three_text_encoders: + lora_loadable_components = self.pipeline_class._lora_loadable_modules + if "text_encoder_2" in lora_loadable_components: + pipe.text_encoder_2.add_adapter(text_lora_config) self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder" + check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" ) - denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet - denoiser.add_adapter(denoiser_lora_config) - self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.") + output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] - if self.has_two_text_encoders or self.has_three_text_encoders: - lora_loadable_components = self.pipeline_class._lora_loadable_modules - if "text_encoder_2" in lora_loadable_components: - pipe.text_encoder_2.add_adapter(text_lora_config) - self.assertTrue( - check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2" - ) + with tempfile.TemporaryDirectory() as tmpdirname: + modules_to_save = self._get_modules_to_save(pipe, has_denoiser=True) + lora_state_dicts = self._get_lora_state_dicts(modules_to_save) + self.pipeline_class.save_lora_weights(save_directory=tmpdirname, **lora_state_dicts) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) - output_adapter_1 = pipe(**inputs, generator=torch.manual_seed(0))[0] + # First, delete adapter and compare. + pipe.delete_adapters(pipe.get_active_adapters()[0]) + output_no_adapter = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertFalse(np.allclose(output_adapter_1, output_no_adapter, atol=1e-3, rtol=1e-3)) + self.assertTrue(np.allclose(output_no_lora, output_no_adapter, atol=1e-3, rtol=1e-3)) - with tempfile.TemporaryDirectory() as tmpdirname: - modules_to_save = self._get_modules_to_save(pipe, has_denoiser=True) - lora_state_dicts = self._get_lora_state_dicts(modules_to_save) - self.pipeline_class.save_lora_weights(save_directory=tmpdirname, **lora_state_dicts) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) - - # First, delete adapter and compare. - pipe.delete_adapters(pipe.get_active_adapters()[0]) - output_no_adapter = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertFalse(np.allclose(output_adapter_1, output_no_adapter, atol=1e-3, rtol=1e-3)) - self.assertTrue(np.allclose(output_no_lora, output_no_adapter, atol=1e-3, rtol=1e-3)) - - # Then load adapter and compare. - pipe.load_lora_weights(tmpdirname) - output_lora_loaded = pipe(**inputs, generator=torch.manual_seed(0))[0] - self.assertTrue(np.allclose(output_adapter_1, output_lora_loaded, atol=1e-3, rtol=1e-3)) + # Then load adapter and compare. + pipe.load_lora_weights(tmpdirname) + output_lora_loaded = pipe(**inputs, generator=torch.manual_seed(0))[0] + self.assertTrue(np.allclose(output_adapter_1, output_lora_loaded, atol=1e-3, rtol=1e-3)) def _test_group_offloading_inference_denoiser(self, offload_type, use_stream): from diffusers.hooks.group_offloading import _get_top_level_group_offload_hook @@ -2382,7 +2303,7 @@ class PeftLoraLoaderMixinTests: onload_device = torch_device offload_device = torch.device("cpu") - components, text_lora_config, denoiser_lora_config = self.get_dummy_components(self.scheduler_classes[0]) + components, text_lora_config, denoiser_lora_config = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) @@ -2399,7 +2320,7 @@ class PeftLoraLoaderMixinTests: ) self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) - components, _, _ = self.get_dummy_components(self.scheduler_classes[0]) + components, _, _ = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe.set_progress_bar_config(disable=None) denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet @@ -2451,7 +2372,7 @@ class PeftLoraLoaderMixinTests: @require_torch_accelerator def test_lora_loading_model_cpu_offload(self): - components, _, denoiser_lora_config = self.get_dummy_components(self.scheduler_classes[0]) + components, _, denoiser_lora_config = self.get_dummy_components() _, _, inputs = self.get_dummy_inputs(with_generator=False) pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) @@ -2470,7 +2391,7 @@ class PeftLoraLoaderMixinTests: save_directory=tmpdirname, safe_serialization=True, **lora_state_dicts ) # reinitialize the pipeline to mimic the inference workflow. - components, _, denoiser_lora_config = self.get_dummy_components(self.scheduler_classes[0]) + components, _, denoiser_lora_config = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe.enable_model_cpu_offload(device=torch_device) pipe.load_lora_weights(tmpdirname) From 7c54a7b38a9ce42db7c6d8ec86ec656cac9ee216 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Wed, 24 Sep 2025 05:23:41 +0200 Subject: [PATCH 08/18] Fix Custom Code loading (#12378) * update * update * update --- src/diffusers/models/auto_model.py | 2 +- src/diffusers/modular_pipelines/modular_pipeline.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/auto_model.py b/src/diffusers/models/auto_model.py index ada0d54e54..47f3a992b3 100644 --- a/src/diffusers/models/auto_model.py +++ b/src/diffusers/models/auto_model.py @@ -194,7 +194,7 @@ class AutoModel(ConfigMixin): has_remote_code = "auto_map" in config and cls.__name__ in config["auto_map"] trust_remote_code = resolve_trust_remote_code(trust_remote_code, pretrained_model_or_path, has_remote_code) - if not (has_remote_code and trust_remote_code): + if not has_remote_code and trust_remote_code: raise ValueError( "Selected model repository does not happear to have any custom code or does not have a valid `config.json` file." ) diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index 78226a49b1..74ffc62348 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -323,7 +323,7 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin): trust_remote_code = resolve_trust_remote_code( trust_remote_code, pretrained_model_name_or_path, has_remote_code ) - if not (has_remote_code and trust_remote_code): + if not has_remote_code and trust_remote_code: raise ValueError( "Selected model repository does not happear to have any custom code or does not have a valid `config.json` file." ) From 9ef118509e1a2682bb6870533cc3552726284167 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 24 Sep 2025 09:02:25 +0530 Subject: [PATCH 09/18] [tests] disable xformer tests for pipelines it isn't popular. (#12277) disable xformer tests for pipelines it isn't popular. --- tests/pipelines/easyanimate/test_easyanimate.py | 1 + tests/pipelines/hidream_image/test_pipeline_hidream.py | 2 +- tests/pipelines/omnigen/test_pipeline_omnigen.py | 2 +- tests/pipelines/qwenimage/test_qwenimage_controlnet.py | 3 +-- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/easyanimate/test_easyanimate.py b/tests/pipelines/easyanimate/test_easyanimate.py index 2dbb8639f1..5cb2a232bb 100644 --- a/tests/pipelines/easyanimate/test_easyanimate.py +++ b/tests/pipelines/easyanimate/test_easyanimate.py @@ -48,6 +48,7 @@ class EasyAnimatePipelineFastTests(PipelineTesterMixin, unittest.TestCase): batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = TEXT_TO_IMAGE_IMAGE_PARAMS image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS + test_xformers_attention = False required_optional_params = frozenset( [ "num_inference_steps", diff --git a/tests/pipelines/hidream_image/test_pipeline_hidream.py b/tests/pipelines/hidream_image/test_pipeline_hidream.py index ec8d36e1d3..ddf39ba4c1 100644 --- a/tests/pipelines/hidream_image/test_pipeline_hidream.py +++ b/tests/pipelines/hidream_image/test_pipeline_hidream.py @@ -47,8 +47,8 @@ class HiDreamImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase): batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = TEXT_TO_IMAGE_IMAGE_PARAMS image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params + test_xformers_attention = False test_layerwise_casting = True supports_dduf = False diff --git a/tests/pipelines/omnigen/test_pipeline_omnigen.py b/tests/pipelines/omnigen/test_pipeline_omnigen.py index 28648aa76f..1a758b7050 100644 --- a/tests/pipelines/omnigen/test_pipeline_omnigen.py +++ b/tests/pipelines/omnigen/test_pipeline_omnigen.py @@ -22,7 +22,7 @@ class OmniGenPipelineFastTests(unittest.TestCase, PipelineTesterMixin): pipeline_class = OmniGenPipeline params = frozenset(["prompt", "guidance_scale"]) batch_params = frozenset(["prompt"]) - + test_xformers_attention = False test_layerwise_casting = True def get_dummy_components(self): diff --git a/tests/pipelines/qwenimage/test_qwenimage_controlnet.py b/tests/pipelines/qwenimage/test_qwenimage_controlnet.py index c78e5cb233..188106b49b 100644 --- a/tests/pipelines/qwenimage/test_qwenimage_controlnet.py +++ b/tests/pipelines/qwenimage/test_qwenimage_controlnet.py @@ -44,7 +44,6 @@ class QwenControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): batch_params = frozenset(["prompt", "negative_prompt", "control_image"]) image_params = frozenset(["control_image"]) image_latents_params = frozenset(["latents"]) - required_optional_params = frozenset( [ "num_inference_steps", @@ -59,7 +58,7 @@ class QwenControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): ) supports_dduf = False - test_xformers_attention = True + test_xformers_attention = False test_layerwise_casting = True test_group_offloading = True From 7a587349943325e667866971a36996a56fcff143 Mon Sep 17 00:00:00 2001 From: Yao Matrix Date: Tue, 23 Sep 2025 21:01:45 -0700 Subject: [PATCH 10/18] xpu enabling for 4 cases (#12345) Signed-off-by: Yao, Matrix --- .../modular_pipelines/components_manager.py | 13 ++++++++++--- tests/lora/test_lora_layers_hunyuanvideo.py | 1 + 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/diffusers/modular_pipelines/components_manager.py b/src/diffusers/modular_pipelines/components_manager.py index f48a227e2e..ed847fa414 100644 --- a/src/diffusers/modular_pipelines/components_manager.py +++ b/src/diffusers/modular_pipelines/components_manager.py @@ -25,6 +25,7 @@ from ..utils import ( is_accelerate_available, logging, ) +from ..utils.torch_utils import get_device if is_accelerate_available(): @@ -161,7 +162,9 @@ class AutoOffloadStrategy: current_module_size = model.get_memory_footprint() - mem_on_device = torch.cuda.mem_get_info(execution_device.index)[0] + device_type = execution_device.type + device_module = getattr(torch, device_type, torch.cuda) + mem_on_device = device_module.mem_get_info(execution_device.index)[0] mem_on_device = mem_on_device - self.memory_reserve_margin if current_module_size < mem_on_device: return [] @@ -301,7 +304,7 @@ class ComponentsManager: cm.add("vae", vae_model, collection="sdxl") # Enable auto offloading - cm.enable_auto_cpu_offload(device="cuda") + cm.enable_auto_cpu_offload() # Retrieve components unet = cm.get_one(name="unet", collection="sdxl") @@ -490,6 +493,8 @@ class ComponentsManager: gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() + if torch.xpu.is_available(): + torch.xpu.empty_cache() # YiYi TODO: rename to search_components for now, may remove this method def search_components( @@ -678,7 +683,7 @@ class ComponentsManager: return get_return_dict(matches, return_dict_with_names) - def enable_auto_cpu_offload(self, device: Union[str, int, torch.device] = "cuda", memory_reserve_margin="3GB"): + def enable_auto_cpu_offload(self, device: Union[str, int, torch.device] = None, memory_reserve_margin="3GB"): """ Enable automatic CPU offloading for all components. @@ -704,6 +709,8 @@ class ComponentsManager: self.disable_auto_cpu_offload() offload_strategy = AutoOffloadStrategy(memory_reserve_margin=memory_reserve_margin) + if device is None: + device = get_device() device = torch.device(device) if device.index is None: device = torch.device(f"{device.type}:{0}") diff --git a/tests/lora/test_lora_layers_hunyuanvideo.py b/tests/lora/test_lora_layers_hunyuanvideo.py index 7ea0f1fcc9..cfd5d3146a 100644 --- a/tests/lora/test_lora_layers_hunyuanvideo.py +++ b/tests/lora/test_lora_layers_hunyuanvideo.py @@ -253,6 +253,7 @@ class HunyuanVideoLoRAIntegrationTests(unittest.TestCase): expected_slices = Expectations( { ("cuda", 7): np.array([0.1013, 0.1924, 0.0078, 0.1021, 0.1929, 0.0078, 0.1023, 0.1919, 0.7402, 0.104, 0.4482, 0.7354, 0.0925, 0.4382, 0.7275, 0.0815]), + ("xpu", 3): np.array([0.1013, 0.1924, 0.0078, 0.1021, 0.1929, 0.0078, 0.1023, 0.1919, 0.7402, 0.104, 0.4482, 0.7354, 0.0925, 0.4382, 0.7275, 0.0815]), } ) # fmt: on From 08c29020dd558899a53137441f908ab668c427d6 Mon Sep 17 00:00:00 2001 From: Yao Matrix Date: Tue, 23 Sep 2025 21:02:06 -0700 Subject: [PATCH 11/18] fix marigold ut case fail on xpu (#12350) Signed-off-by: Yao, Matrix --- .../pipelines/marigold/test_marigold_depth.py | 30 +++++++++++++------ 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tests/pipelines/marigold/test_marigold_depth.py b/tests/pipelines/marigold/test_marigold_depth.py index 3e8ccbf5c0..3c85305992 100644 --- a/tests/pipelines/marigold/test_marigold_depth.py +++ b/tests/pipelines/marigold/test_marigold_depth.py @@ -33,6 +33,7 @@ from diffusers import ( ) from ...testing_utils import ( + Expectations, backend_empty_cache, enable_full_determinism, floats_tensor, @@ -356,7 +357,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase): match_input_resolution=True, ) - def test_marigold_depth_einstein_f32_cuda_G0_S1_P768_E1_B1_M1(self): + def test_marigold_depth_einstein_f32_accelerator_G0_S1_P768_E1_B1_M1(self): self._test_marigold_depth( is_fp16=False, device=torch_device, @@ -369,7 +370,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase): match_input_resolution=True, ) - def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E1_B1_M1(self): + def test_marigold_depth_einstein_f16_accelerator_G0_S1_P768_E1_B1_M1(self): self._test_marigold_depth( is_fp16=True, device=torch_device, @@ -382,7 +383,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase): match_input_resolution=True, ) - def test_marigold_depth_einstein_f16_cuda_G2024_S1_P768_E1_B1_M1(self): + def test_marigold_depth_einstein_f16_accelerator_G2024_S1_P768_E1_B1_M1(self): self._test_marigold_depth( is_fp16=True, device=torch_device, @@ -395,12 +396,23 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase): match_input_resolution=True, ) - def test_marigold_depth_einstein_f16_cuda_G0_S2_P768_E1_B1_M1(self): + def test_marigold_depth_einstein_f16_accelerator_G0_S2_P768_E1_B1_M1(self): + # fmt: off + expected_slices = Expectations( + { + ("cuda", 7): np.array([0.1085, 0.1098, 0.1110, 0.1081, 0.1085, 0.1082, 0.1085, 0.1057, 0.0996]), + ("xpu", 3): np.array([0.1084, 0.1096, 0.1108, 0.1080, 0.1083, 0.1080, + 0.1085, 0.1057, 0.0996]), + } + ) + expected_slice = expected_slices.get_expectation() + # fmt: on + self._test_marigold_depth( is_fp16=True, device=torch_device, generator_seed=0, - expected_slice=np.array([0.1085, 0.1098, 0.1110, 0.1081, 0.1085, 0.1082, 0.1085, 0.1057, 0.0996]), + expected_slice=expected_slice, num_inference_steps=2, processing_resolution=768, ensemble_size=1, @@ -408,7 +420,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase): match_input_resolution=True, ) - def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M1(self): + def test_marigold_depth_einstein_f16_accelerator_G0_S1_P512_E1_B1_M1(self): self._test_marigold_depth( is_fp16=True, device=torch_device, @@ -421,7 +433,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase): match_input_resolution=True, ) - def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E3_B1_M1(self): + def test_marigold_depth_einstein_f16_accelerator_G0_S1_P768_E3_B1_M1(self): self._test_marigold_depth( is_fp16=True, device=torch_device, @@ -435,7 +447,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase): match_input_resolution=True, ) - def test_marigold_depth_einstein_f16_cuda_G0_S1_P768_E4_B2_M1(self): + def test_marigold_depth_einstein_f16_accelerator_G0_S1_P768_E4_B2_M1(self): self._test_marigold_depth( is_fp16=True, device=torch_device, @@ -449,7 +461,7 @@ class MarigoldDepthPipelineIntegrationTests(unittest.TestCase): match_input_resolution=True, ) - def test_marigold_depth_einstein_f16_cuda_G0_S1_P512_E1_B1_M0(self): + def test_marigold_depth_einstein_f16_accelerator_G0_S1_P512_E1_B1_M0(self): self._test_marigold_depth( is_fp16=True, device=torch_device, From 043ab2520f6a19fce78e6e060a68dbc947edb9f9 Mon Sep 17 00:00:00 2001 From: Alberto Chimenti <43784986+albchim@users.noreply.github.com> Date: Wed, 24 Sep 2025 11:45:04 +0200 Subject: [PATCH 12/18] Fix WanVACEPipeline to allow prompt to be None and skip encoding step (#12251) Fixed WanVACEPipeline to allow prompt to be None and skip encoding step --- src/diffusers/pipelines/wan/pipeline_wan_vace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/wan/pipeline_wan_vace.py b/src/diffusers/pipelines/wan/pipeline_wan_vace.py index eab1aacfc5..2b1890afec 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_vace.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_vace.py @@ -795,7 +795,7 @@ class WanVACEPipeline(DiffusionPipeline, WanLoraLoaderMixin): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs # Simplification of implementation for now - if not isinstance(prompt, str): + if prompt is not None and not isinstance(prompt, str): raise ValueError("Passing a list of prompts is not yet supported. This may be supported in the future.") if num_videos_per_prompt != 1: raise ValueError( From dcb6dd9b7a6c7ddd6875506f40597c0976fd02c5 Mon Sep 17 00:00:00 2001 From: Aryan Date: Wed, 24 Sep 2025 19:03:25 +0530 Subject: [PATCH 13/18] Context Parallel w/ Ring & Ulysses & Unified Attention (#11941) * update * update * add coauthor Co-Authored-By: Dhruv Nair * improve test * handle ip adapter params correctly * fix chroma qkv fusion test * fix fastercache implementation * fix more tests * fight more tests * add back set_attention_backend * update * update * make style * make fix-copies * make ip adapter processor compatible with attention dispatcher * refactor chroma as well * remove rmsnorm assert * minify and deprecate npu/xla processors * update * refactor * refactor; support flash attention 2 with cp * fix * support sage attention with cp * make torch compile compatible * update * refactor * update * refactor * refactor * add ulysses backward * try to make dreambooth script work; accelerator backward not playing well * Revert "try to make dreambooth script work; accelerator backward not playing well" This reverts commit 768d0ea6fa6a305d12df1feda2afae3ec80aa449. * workaround compilation problems with triton when doing all-to-all * support wan * handle backward correctly * support qwen * support ltx * make fix-copies * Update src/diffusers/models/modeling_utils.py Co-authored-by: Dhruv Nair * apply review suggestions * update docs * add explanation * make fix-copies * add docstrings * support passing parallel_config to from_pretrained * apply review suggestions * make style * update * Update docs/source/en/api/parallel.md Co-authored-by: Aryan * up --------- Co-authored-by: Dhruv Nair Co-authored-by: sayakpaul --- docs/source/en/_toctree.yml | 2 + docs/source/en/api/parallel.md | 24 + src/diffusers/__init__.py | 4 + src/diffusers/hooks/__init__.py | 1 + src/diffusers/hooks/context_parallel.py | 297 ++++++ src/diffusers/models/__init__.py | 2 + src/diffusers/models/_modeling_parallel.py | 241 +++++ src/diffusers/models/attention_dispatch.py | 987 +++++++++++++++--- src/diffusers/models/modeling_utils.py | 78 +- .../models/transformers/transformer_bria.py | 8 +- .../models/transformers/transformer_flux.py | 21 +- .../models/transformers/transformer_ltx.py | 15 + .../transformers/transformer_qwenimage.py | 15 + .../transformers/transformer_skyreels_v2.py | 3 + .../models/transformers/transformer_wan.py | 17 + src/diffusers/utils/dummy_pt_objects.py | 30 + 16 files changed, 1571 insertions(+), 174 deletions(-) create mode 100644 docs/source/en/api/parallel.md create mode 100644 src/diffusers/hooks/context_parallel.py create mode 100644 src/diffusers/models/_modeling_parallel.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 4879a7bf04..07408cc30e 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -70,6 +70,8 @@ title: Reduce memory usage - local: optimization/speed-memory-optims title: Compiling and offloading quantized models + - local: api/parallel + title: Parallel inference - title: Community optimizations sections: - local: optimization/pruna diff --git a/docs/source/en/api/parallel.md b/docs/source/en/api/parallel.md new file mode 100644 index 0000000000..e38ffe571e --- /dev/null +++ b/docs/source/en/api/parallel.md @@ -0,0 +1,24 @@ + + +# Parallelism + +Parallelism strategies help speed up diffusion transformers by distributing computations across multiple devices, allowing for faster inference/training times. + +## ParallelConfig + +[[autodoc]] ParallelConfig + +## ContextParallelConfig + +[[autodoc]] ContextParallelConfig + +[[autodoc]] hooks.apply_context_parallel diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 741fcd14f2..8867250ded 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -202,6 +202,7 @@ else: "CogView4Transformer2DModel", "ConsisIDTransformer3DModel", "ConsistencyDecoderVAE", + "ContextParallelConfig", "ControlNetModel", "ControlNetUnionModel", "ControlNetXSAdapter", @@ -229,6 +230,7 @@ else: "MultiAdapter", "MultiControlNetModel", "OmniGenTransformer2DModel", + "ParallelConfig", "PixArtTransformer2DModel", "PriorTransformer", "QwenImageControlNetModel", @@ -888,6 +890,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: CogView4Transformer2DModel, ConsisIDTransformer3DModel, ConsistencyDecoderVAE, + ContextParallelConfig, ControlNetModel, ControlNetUnionModel, ControlNetXSAdapter, @@ -915,6 +918,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: MultiAdapter, MultiControlNetModel, OmniGenTransformer2DModel, + ParallelConfig, PixArtTransformer2DModel, PriorTransformer, QwenImageControlNetModel, diff --git a/src/diffusers/hooks/__init__.py b/src/diffusers/hooks/__init__.py index 525a0747da..524a92ea99 100644 --- a/src/diffusers/hooks/__init__.py +++ b/src/diffusers/hooks/__init__.py @@ -16,6 +16,7 @@ from ..utils import is_torch_available if is_torch_available(): + from .context_parallel import apply_context_parallel from .faster_cache import FasterCacheConfig, apply_faster_cache from .first_block_cache import FirstBlockCacheConfig, apply_first_block_cache from .group_offloading import apply_group_offloading diff --git a/src/diffusers/hooks/context_parallel.py b/src/diffusers/hooks/context_parallel.py new file mode 100644 index 0000000000..83406d4969 --- /dev/null +++ b/src/diffusers/hooks/context_parallel.py @@ -0,0 +1,297 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from dataclasses import dataclass +from typing import Dict, List, Type, Union + +import torch +import torch.distributed._functional_collectives as funcol + +from ..models._modeling_parallel import ( + ContextParallelConfig, + ContextParallelInput, + ContextParallelModelPlan, + ContextParallelOutput, +) +from ..utils import get_logger +from ..utils.torch_utils import unwrap_module +from .hooks import HookRegistry, ModelHook + + +logger = get_logger(__name__) # pylint: disable=invalid-name + +_CONTEXT_PARALLEL_INPUT_HOOK_TEMPLATE = "cp_input---{}" +_CONTEXT_PARALLEL_OUTPUT_HOOK_TEMPLATE = "cp_output---{}" + + +# TODO(aryan): consolidate with ._helpers.TransformerBlockMetadata +@dataclass +class ModuleForwardMetadata: + cached_parameter_indices: Dict[str, int] = None + _cls: Type = None + + def _get_parameter_from_args_kwargs(self, identifier: str, args=(), kwargs=None): + kwargs = kwargs or {} + + if identifier in kwargs: + return kwargs[identifier], True, None + + if self.cached_parameter_indices is not None: + index = self.cached_parameter_indices.get(identifier, None) + if index is None: + raise ValueError(f"Parameter '{identifier}' not found in cached indices.") + return args[index], False, index + + if self._cls is None: + raise ValueError("Model class is not set for metadata.") + + parameters = list(inspect.signature(self._cls.forward).parameters.keys()) + parameters = parameters[1:] # skip `self` + self.cached_parameter_indices = {param: i for i, param in enumerate(parameters)} + + if identifier not in self.cached_parameter_indices: + raise ValueError(f"Parameter '{identifier}' not found in function signature but was requested.") + + index = self.cached_parameter_indices[identifier] + + if index >= len(args): + raise ValueError(f"Expected {index} arguments but got {len(args)}.") + + return args[index], False, index + + +def apply_context_parallel( + module: torch.nn.Module, + parallel_config: ContextParallelConfig, + plan: Dict[str, ContextParallelModelPlan], +) -> None: + """Apply context parallel on a model.""" + logger.debug(f"Applying context parallel with CP mesh: {parallel_config._mesh} and plan: {plan}") + + for module_id, cp_model_plan in plan.items(): + submodule = _get_submodule_by_name(module, module_id) + if not isinstance(submodule, list): + submodule = [submodule] + + logger.debug(f"Applying ContextParallelHook to {module_id=} identifying a total of {len(submodule)} modules") + + for m in submodule: + if isinstance(cp_model_plan, dict): + hook = ContextParallelSplitHook(cp_model_plan, parallel_config) + hook_name = _CONTEXT_PARALLEL_INPUT_HOOK_TEMPLATE.format(module_id) + elif isinstance(cp_model_plan, (ContextParallelOutput, list, tuple)): + if isinstance(cp_model_plan, ContextParallelOutput): + cp_model_plan = [cp_model_plan] + if not all(isinstance(x, ContextParallelOutput) for x in cp_model_plan): + raise ValueError(f"Expected all elements of cp_model_plan to be CPOutput, but got {cp_model_plan}") + hook = ContextParallelGatherHook(cp_model_plan, parallel_config) + hook_name = _CONTEXT_PARALLEL_OUTPUT_HOOK_TEMPLATE.format(module_id) + else: + raise ValueError(f"Unsupported context parallel model plan type: {type(cp_model_plan)}") + registry = HookRegistry.check_if_exists_or_initialize(m) + registry.register_hook(hook, hook_name) + + +def remove_context_parallel(module: torch.nn.Module, plan: Dict[str, ContextParallelModelPlan]) -> None: + for module_id, cp_model_plan in plan.items(): + submodule = _get_submodule_by_name(module, module_id) + if not isinstance(submodule, list): + submodule = [submodule] + + for m in submodule: + registry = HookRegistry.check_if_exists_or_initialize(m) + if isinstance(cp_model_plan, dict): + hook_name = _CONTEXT_PARALLEL_INPUT_HOOK_TEMPLATE.format(module_id) + elif isinstance(cp_model_plan, (ContextParallelOutput, list, tuple)): + hook_name = _CONTEXT_PARALLEL_OUTPUT_HOOK_TEMPLATE.format(module_id) + else: + raise ValueError(f"Unsupported context parallel model plan type: {type(cp_model_plan)}") + registry.remove_hook(hook_name) + + +class ContextParallelSplitHook(ModelHook): + def __init__(self, metadata: ContextParallelModelPlan, parallel_config: ContextParallelConfig) -> None: + super().__init__() + self.metadata = metadata + self.parallel_config = parallel_config + self.module_forward_metadata = None + + def initialize_hook(self, module): + cls = unwrap_module(module).__class__ + self.module_forward_metadata = ModuleForwardMetadata(_cls=cls) + return module + + def pre_forward(self, module, *args, **kwargs): + args_list = list(args) + + for name, cpm in self.metadata.items(): + if isinstance(cpm, ContextParallelInput) and cpm.split_output: + continue + + # Maybe the parameter was passed as a keyword argument + input_val, is_kwarg, index = self.module_forward_metadata._get_parameter_from_args_kwargs( + name, args_list, kwargs + ) + + if input_val is None: + continue + + # The input_val may be a tensor or list/tuple of tensors. In certain cases, user may specify to shard + # the output instead of input for a particular layer by setting split_output=True + if isinstance(input_val, torch.Tensor): + input_val = self._prepare_cp_input(input_val, cpm) + elif isinstance(input_val, (list, tuple)): + if len(input_val) != len(cpm): + raise ValueError( + f"Expected input model plan to have {len(input_val)} elements, but got {len(cpm)}." + ) + sharded_input_val = [] + for i, x in enumerate(input_val): + if torch.is_tensor(x) and not cpm[i].split_output: + x = self._prepare_cp_input(x, cpm[i]) + sharded_input_val.append(x) + input_val = sharded_input_val + else: + raise ValueError(f"Unsupported input type: {type(input_val)}") + + if is_kwarg: + kwargs[name] = input_val + elif index is not None and index < len(args_list): + args_list[index] = input_val + else: + raise ValueError( + f"An unexpected error occurred while processing the input '{name}'. Please open an " + f"issue at https://github.com/huggingface/diffusers/issues and provide a minimal reproducible " + f"example along with the full stack trace." + ) + + return tuple(args_list), kwargs + + def post_forward(self, module, output): + is_tensor = isinstance(output, torch.Tensor) + is_tensor_list = isinstance(output, (list, tuple)) and all(isinstance(x, torch.Tensor) for x in output) + + if not is_tensor and not is_tensor_list: + raise ValueError(f"Expected output to be a tensor or a list/tuple of tensors, but got {type(output)}.") + + output = [output] if is_tensor else list(output) + for index, cpm in self.metadata.items(): + if not isinstance(cpm, ContextParallelInput) or not cpm.split_output: + continue + if index >= len(output): + raise ValueError(f"Index {index} out of bounds for output of length {len(output)}.") + current_output = output[index] + current_output = self._prepare_cp_input(current_output, cpm) + output[index] = current_output + + return output[0] if is_tensor else tuple(output) + + def _prepare_cp_input(self, x: torch.Tensor, cp_input: ContextParallelInput) -> torch.Tensor: + if cp_input.expected_dims is not None and x.dim() != cp_input.expected_dims: + raise ValueError( + f"Expected input tensor to have {cp_input.expected_dims} dimensions, but got {x.dim()} dimensions." + ) + return EquipartitionSharder.shard(x, cp_input.split_dim, self.parallel_config._flattened_mesh) + + +class ContextParallelGatherHook(ModelHook): + def __init__(self, metadata: ContextParallelModelPlan, parallel_config: ContextParallelConfig) -> None: + super().__init__() + self.metadata = metadata + self.parallel_config = parallel_config + + def post_forward(self, module, output): + is_tensor = isinstance(output, torch.Tensor) + + if is_tensor: + output = [output] + elif not (isinstance(output, (list, tuple)) and all(isinstance(x, torch.Tensor) for x in output)): + raise ValueError(f"Expected output to be a tensor or a list/tuple of tensors, but got {type(output)}.") + + output = list(output) + + if len(output) != len(self.metadata): + raise ValueError(f"Expected output to have {len(self.metadata)} elements, but got {len(output)}.") + + for i, cpm in enumerate(self.metadata): + if cpm is None: + continue + output[i] = EquipartitionSharder.unshard(output[i], cpm.gather_dim, self.parallel_config._flattened_mesh) + + return output[0] if is_tensor else tuple(output) + + +class AllGatherFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, tensor, dim, group): + ctx.dim = dim + ctx.group = group + ctx.world_size = torch.distributed.get_world_size(group) + ctx.rank = torch.distributed.get_rank(group) + return funcol.all_gather_tensor(tensor, dim, group=group) + + @staticmethod + def backward(ctx, grad_output): + grad_chunks = torch.chunk(grad_output, ctx.world_size, dim=ctx.dim) + return grad_chunks[ctx.rank], None, None + + +class EquipartitionSharder: + @classmethod + def shard(cls, tensor: torch.Tensor, dim: int, mesh: torch.distributed.device_mesh.DeviceMesh) -> torch.Tensor: + # NOTE: the following assertion does not have to be true in general. We simply enforce it for now + # because the alternate case has not yet been tested/required for any model. + assert tensor.size()[dim] % mesh.size() == 0, ( + "Tensor size along dimension to be sharded must be divisible by mesh size" + ) + + # The following is not fullgraph compatible with Dynamo (fails in DeviceMesh.get_rank) + # return tensor.chunk(mesh.size(), dim=dim)[mesh.get_rank()] + + return tensor.chunk(mesh.size(), dim=dim)[torch.distributed.get_rank(mesh.get_group())] + + @classmethod + def unshard(cls, tensor: torch.Tensor, dim: int, mesh: torch.distributed.device_mesh.DeviceMesh) -> torch.Tensor: + tensor = tensor.contiguous() + tensor = AllGatherFunction.apply(tensor, dim, mesh.get_group()) + return tensor + + +def _get_submodule_by_name(model: torch.nn.Module, name: str) -> Union[torch.nn.Module, List[torch.nn.Module]]: + if name.count("*") > 1: + raise ValueError("Wildcard '*' can only be used once in the name") + return _find_submodule_by_name(model, name) + + +def _find_submodule_by_name(model: torch.nn.Module, name: str) -> Union[torch.nn.Module, List[torch.nn.Module]]: + if name == "": + return model + first_atom, remaining_name = name.split(".", 1) if "." in name else (name, "") + if first_atom == "*": + if not isinstance(model, torch.nn.ModuleList): + raise ValueError("Wildcard '*' can only be used with ModuleList") + submodules = [] + for submodule in model: + subsubmodules = _find_submodule_by_name(submodule, remaining_name) + if not isinstance(subsubmodules, list): + subsubmodules = [subsubmodules] + submodules.extend(subsubmodules) + return submodules + else: + if hasattr(model, first_atom): + submodule = getattr(model, first_atom) + return _find_submodule_by_name(submodule, remaining_name) + else: + raise ValueError(f"'{first_atom}' is not a submodule of '{model.__class__.__name__}'") diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 49ac2a1c56..457f70448a 100755 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -25,6 +25,7 @@ from ..utils import ( _import_structure = {} if is_torch_available(): + _import_structure["_modeling_parallel"] = ["ContextParallelConfig", "ParallelConfig"] _import_structure["adapter"] = ["MultiAdapter", "T2IAdapter"] _import_structure["attention_dispatch"] = ["AttentionBackendName", "attention_backend"] _import_structure["auto_model"] = ["AutoModel"] @@ -119,6 +120,7 @@ if is_flax_available(): if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: if is_torch_available(): + from ._modeling_parallel import ContextParallelConfig, ParallelConfig from .adapter import MultiAdapter, T2IAdapter from .attention_dispatch import AttentionBackendName, attention_backend from .auto_model import AutoModel diff --git a/src/diffusers/models/_modeling_parallel.py b/src/diffusers/models/_modeling_parallel.py new file mode 100644 index 0000000000..2a1d2cc6ce --- /dev/null +++ b/src/diffusers/models/_modeling_parallel.py @@ -0,0 +1,241 @@ +# 🚨🚨🚨 Experimental parallelism support for Diffusers 🚨🚨🚨 +# Experimental changes are subject to change and APIs may break without warning. + +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union + +import torch + +from ..utils import get_logger + + +if TYPE_CHECKING: + pass + + +logger = get_logger(__name__) # pylint: disable=invalid-name + + +# TODO(aryan): add support for the following: +# - Unified Attention +# - More dispatcher attention backends +# - CFG/Data Parallel +# - Tensor Parallel + + +@dataclass +class ContextParallelConfig: + """ + Configuration for context parallelism. + + Args: + ring_degree (`int`, *optional*, defaults to `1`): + Number of devices to use for ring attention within a context parallel region. Must be a divisor of the + total number of devices in the context parallel mesh. + ulysses_degree (`int`, *optional*, defaults to `1`): + Number of devices to use for ulysses attention within a context parallel region. Must be a divisor of the + total number of devices in the context parallel mesh. + convert_to_fp32 (`bool`, *optional*, defaults to `True`): + Whether to convert output and LSE to float32 for ring attention numerical stability. + rotate_method (`str`, *optional*, defaults to `"allgather"`): + Method to use for rotating key/value states across devices in ring attention. Currently, only `"allgather"` + is supported. + + """ + + ring_degree: Optional[int] = None + ulysses_degree: Optional[int] = None + convert_to_fp32: bool = True + # TODO: support alltoall + rotate_method: Literal["allgather", "alltoall"] = "allgather" + + _rank: int = None + _world_size: int = None + _device: torch.device = None + _mesh: torch.distributed.device_mesh.DeviceMesh = None + _flattened_mesh: torch.distributed.device_mesh.DeviceMesh = None + _ring_mesh: torch.distributed.device_mesh.DeviceMesh = None + _ulysses_mesh: torch.distributed.device_mesh.DeviceMesh = None + _ring_local_rank: int = None + _ulysses_local_rank: int = None + + def __post_init__(self): + if self.ring_degree is None: + self.ring_degree = 1 + if self.ulysses_degree is None: + self.ulysses_degree = 1 + + def setup(self, rank: int, world_size: int, device: torch.device, mesh: torch.distributed.device_mesh.DeviceMesh): + self._rank = rank + self._world_size = world_size + self._device = device + self._mesh = mesh + if self.ring_degree is None: + self.ring_degree = 1 + if self.ulysses_degree is None: + self.ulysses_degree = 1 + if self.rotate_method != "allgather": + raise NotImplementedError( + f"Only rotate_method='allgather' is supported for now, but got {self.rotate_method}." + ) + if self._flattened_mesh is None: + self._flattened_mesh = self._mesh._flatten() + if self._ring_mesh is None: + self._ring_mesh = self._mesh["ring"] + if self._ulysses_mesh is None: + self._ulysses_mesh = self._mesh["ulysses"] + if self._ring_local_rank is None: + self._ring_local_rank = self._ring_mesh.get_local_rank() + if self._ulysses_local_rank is None: + self._ulysses_local_rank = self._ulysses_mesh.get_local_rank() + + +@dataclass +class ParallelConfig: + """ + Configuration for applying different parallelisms. + + Args: + context_parallel_config (`ContextParallelConfig`, *optional*): + Configuration for context parallelism. + """ + + context_parallel_config: Optional[ContextParallelConfig] = None + + _rank: int = None + _world_size: int = None + _device: torch.device = None + _cp_mesh: torch.distributed.device_mesh.DeviceMesh = None + + def setup( + self, + rank: int, + world_size: int, + device: torch.device, + *, + cp_mesh: Optional[torch.distributed.device_mesh.DeviceMesh] = None, + ): + self._rank = rank + self._world_size = world_size + self._device = device + self._cp_mesh = cp_mesh + if self.context_parallel_config is not None: + self.context_parallel_config.setup(rank, world_size, device, cp_mesh) + + +@dataclass(frozen=True) +class ContextParallelInput: + """ + Configuration for splitting an input tensor across context parallel region. + + Args: + split_dim (`int`): + The dimension along which to split the tensor. + expected_dims (`int`, *optional*): + The expected number of dimensions of the tensor. If provided, a check will be performed to ensure that the + tensor has the expected number of dimensions before splitting. + split_output (`bool`, *optional*, defaults to `False`): + Whether to split the output tensor of the layer along the given `split_dim` instead of the input tensor. + This is useful for layers whose outputs should be split after it does some preprocessing on the inputs (ex: + RoPE). + """ + + split_dim: int + expected_dims: Optional[int] = None + split_output: bool = False + + def __repr__(self): + return f"ContextParallelInput(split_dim={self.split_dim}, expected_dims={self.expected_dims}, split_output={self.split_output})" + + +@dataclass(frozen=True) +class ContextParallelOutput: + """ + Configuration for gathering an output tensor across context parallel region. + + Args: + gather_dim (`int`): + The dimension along which to gather the tensor. + expected_dims (`int`, *optional*): + The expected number of dimensions of the tensor. If provided, a check will be performed to ensure that the + tensor has the expected number of dimensions before gathering. + """ + + gather_dim: int + expected_dims: Optional[int] = None + + def __repr__(self): + return f"ContextParallelOutput(gather_dim={self.gather_dim}, expected_dims={self.expected_dims})" + + +# A dictionary where keys denote the input to be split across context parallel region, and the +# value denotes the sharding configuration. +# If the key is a string, it denotes the name of the parameter in the forward function. +# If the key is an integer, split_output must be set to True, and it denotes the index of the output +# to be split across context parallel region. +ContextParallelInputType = Dict[ + Union[str, int], Union[ContextParallelInput, List[ContextParallelInput], Tuple[ContextParallelInput, ...]] +] + +# A dictionary where keys denote the output to be gathered across context parallel region, and the +# value denotes the gathering configuration. +ContextParallelOutputType = Union[ + ContextParallelOutput, List[ContextParallelOutput], Tuple[ContextParallelOutput, ...] +] + +# A dictionary where keys denote the module id, and the value denotes how the inputs/outputs of +# the module should be split/gathered across context parallel region. +ContextParallelModelPlan = Dict[str, Union[ContextParallelInputType, ContextParallelOutputType]] + + +# Example of a ContextParallelModelPlan (QwenImageTransformer2DModel): +# +# Each model should define a _cp_plan attribute that contains information on how to shard/gather +# tensors at different stages of the forward: +# +# ```python +# _cp_plan = { +# "": { +# "hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False), +# "encoder_hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False), +# "encoder_hidden_states_mask": ContextParallelInput(split_dim=1, expected_dims=2, split_output=False), +# }, +# "pos_embed": { +# 0: ContextParallelInput(split_dim=0, expected_dims=2, split_output=True), +# 1: ContextParallelInput(split_dim=0, expected_dims=2, split_output=True), +# }, +# "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3), +# } +# ``` +# +# The dictionary is a set of module names mapped to their respective CP plan. The inputs/outputs of layers will be +# split/gathered according to this at the respective module level. Here, the following happens: +# - "": +# we specify that we want to split the various inputs across the sequence dim in the pre-forward hook (i.e. before +# the actual forward logic of the QwenImageTransformer2DModel is run, we will splitthe inputs) +# - "pos_embed": +# we specify that we want to split the outputs of the RoPE layer. Since there are two outputs (imag & text freqs), +# we can individually specify how they should be split +# - "proj_out": +# before returning to the user, we gather the entire sequence on each rank in the post-forward hook (after the linear +# layer forward has run). +# +# ContextParallelInput: +# specifies how to split the input tensor in the pre-forward or post-forward hook of the layer it is attached to +# +# ContextParallelOutput: +# specifies how to gather the input tensor in the post-forward hook in the layer it is attached to diff --git a/src/diffusers/models/attention_dispatch.py b/src/diffusers/models/attention_dispatch.py index f71be7c8ec..0a2ad68123 100644 --- a/src/diffusers/models/attention_dispatch.py +++ b/src/diffusers/models/attention_dispatch.py @@ -17,9 +17,10 @@ import functools import inspect import math from enum import Enum -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union import torch +import torch.distributed._functional_collectives as funcol from ..utils import ( get_logger, @@ -39,6 +40,9 @@ from ..utils import ( from ..utils.constants import DIFFUSERS_ATTN_BACKEND, DIFFUSERS_ATTN_CHECKS, DIFFUSERS_ENABLE_HUB_KERNELS +if TYPE_CHECKING: + from ._modeling_parallel import ParallelConfig + _REQUIRED_FLASH_VERSION = "2.6.3" _REQUIRED_SAGE_VERSION = "2.1.1" _REQUIRED_FLEX_VERSION = "2.5.0" @@ -56,9 +60,12 @@ _CAN_USE_XFORMERS_ATTN = is_xformers_available() and is_xformers_version(">=", _ if _CAN_USE_FLASH_ATTN: from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.flash_attn_interface import _wrapped_flash_attn_backward, _wrapped_flash_attn_forward else: flash_attn_func = None flash_attn_varlen_func = None + _wrapped_flash_attn_backward = None + _wrapped_flash_attn_forward = None if _CAN_USE_FLASH_ATTN_3: @@ -197,17 +204,24 @@ class _AttentionBackendRegistry: _backends = {} _constraints = {} _supported_arg_names = {} + _supports_context_parallel = {} _active_backend = AttentionBackendName(DIFFUSERS_ATTN_BACKEND) _checks_enabled = DIFFUSERS_ATTN_CHECKS @classmethod - def register(cls, backend: AttentionBackendName, constraints: Optional[List[Callable]] = None): + def register( + cls, + backend: AttentionBackendName, + constraints: Optional[List[Callable]] = None, + supports_context_parallel: bool = False, + ): logger.debug(f"Registering attention backend: {backend} with constraints: {constraints}") def decorator(func): cls._backends[backend] = func cls._constraints[backend] = constraints or [] cls._supported_arg_names[backend] = set(inspect.signature(func).parameters.keys()) + cls._supports_context_parallel[backend] = supports_context_parallel return func return decorator @@ -220,6 +234,17 @@ class _AttentionBackendRegistry: def list_backends(cls): return list(cls._backends.keys()) + @classmethod + def _is_context_parallel_enabled( + cls, backend: AttentionBackendName, parallel_config: Optional["ParallelConfig"] + ) -> bool: + supports_context_parallel = backend in cls._supports_context_parallel + is_degree_greater_than_1 = parallel_config is not None and ( + parallel_config.context_parallel_config.ring_degree > 1 + or parallel_config.context_parallel_config.ulysses_degree > 1 + ) + return supports_context_parallel and is_degree_greater_than_1 + @contextlib.contextmanager def attention_backend(backend: Union[str, AttentionBackendName] = AttentionBackendName.NATIVE): @@ -253,6 +278,7 @@ def dispatch_attention_fn( attention_kwargs: Optional[Dict[str, Any]] = None, *, backend: Optional[AttentionBackendName] = None, + parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: attention_kwargs = attention_kwargs or {} @@ -264,6 +290,14 @@ def dispatch_attention_fn( backend_name = AttentionBackendName(backend) backend_fn = _AttentionBackendRegistry._backends.get(backend_name) + if parallel_config is not None and not _AttentionBackendRegistry._is_context_parallel_enabled( + backend_name, parallel_config + ): + raise ValueError( + f"Backend {backend_name} either does not support context parallelism or context parallelism " + f"was enabled with a world size of 1." + ) + kwargs = { "query": query, "key": key, @@ -273,6 +307,7 @@ def dispatch_attention_fn( "is_causal": is_causal, "scale": scale, **attention_kwargs, + "_parallel_config": parallel_config, } if is_torch_version(">=", "2.5.0"): kwargs["enable_gqa"] = enable_gqa @@ -521,22 +556,621 @@ def _flex_attention_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx): # Registrations are required for fullgraph tracing compatibility # TODO: this is only required because the beta release FA3 does not have it. There is a PR adding # this but it was never merged: https://github.com/Dao-AILab/flash-attention/pull/1590 - - -@_custom_op("flash_attn_3::_flash_attn_forward", mutates_args=(), device_types="cuda") -def _wrapped_flash_attn_3_original( - query: torch.Tensor, key: torch.Tensor, value: torch.Tensor +@_custom_op("_diffusers_flash_attn_3::_flash_attn_forward", mutates_args=(), device_types="cuda") +def _wrapped_flash_attn_3( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + softmax_scale: Optional[float] = None, + causal: bool = False, + qv: Optional[torch.Tensor] = None, + q_descale: Optional[torch.Tensor] = None, + k_descale: Optional[torch.Tensor] = None, + v_descale: Optional[torch.Tensor] = None, + attention_chunk: int = 0, + softcap: float = 0.0, + num_splits: int = 1, + pack_gqa: Optional[bool] = None, + deterministic: bool = False, + sm_margin: int = 0, ) -> Tuple[torch.Tensor, torch.Tensor]: - out, lse = flash_attn_3_func(query, key, value) + # Hardcoded for now because pytorch does not support tuple/int type hints + window_size = (-1, -1) + out, lse, *_ = flash_attn_3_func( + q=q, + k=k, + v=v, + softmax_scale=softmax_scale, + causal=causal, + qv=qv, + q_descale=q_descale, + k_descale=k_descale, + v_descale=v_descale, + window_size=window_size, + attention_chunk=attention_chunk, + softcap=softcap, + num_splits=num_splits, + pack_gqa=pack_gqa, + deterministic=deterministic, + sm_margin=sm_margin, + ) lse = lse.permute(0, 2, 1) return out, lse -@_register_fake("flash_attn_3::_flash_attn_forward") -def _(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - batch_size, seq_len, num_heads, head_dim = query.shape +@_register_fake("_diffusers_flash_attn_3::_flash_attn_forward") +def _( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + softmax_scale: Optional[float] = None, + causal: bool = False, + qv: Optional[torch.Tensor] = None, + q_descale: Optional[torch.Tensor] = None, + k_descale: Optional[torch.Tensor] = None, + v_descale: Optional[torch.Tensor] = None, + attention_chunk: int = 0, + softcap: float = 0.0, + num_splits: int = 1, + pack_gqa: Optional[bool] = None, + deterministic: bool = False, + sm_margin: int = 0, +) -> Tuple[torch.Tensor, torch.Tensor]: + window_size = (-1, -1) # noqa: F841 + # A lot of the parameters here are not yet used in any way within diffusers. + # We can safely ignore for now and keep the fake op shape propagation simple. + batch_size, seq_len, num_heads, head_dim = q.shape lse_shape = (batch_size, seq_len, num_heads) - return torch.empty_like(query), query.new_empty(lse_shape) + return torch.empty_like(q), q.new_empty(lse_shape) + + +# ===== Helper functions to use attention backends with templated CP autograd functions ===== + + +# https://github.com/pytorch/pytorch/blob/8904ba638726f8c9a5aff5977c4aa76c9d2edfa6/aten/src/ATen/native/native_functions.yaml#L14958 +# forward declaration: +# aten::_scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0., bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) +def _cudnn_attention_forward_op( + ctx: torch.autograd.function.FunctionCtx, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + dropout_p: float = 0.0, + is_causal: bool = False, + scale: Optional[float] = None, + enable_gqa: bool = False, + return_lse: bool = False, + _save_ctx: bool = True, + _parallel_config: Optional["ParallelConfig"] = None, +): + if enable_gqa: + raise ValueError("`enable_gqa` is not yet supported for cuDNN attention.") + + tensors_to_save = () + + # Contiguous is a must here! Calling cuDNN backend with aten ops produces incorrect results + # if the input tensors are not contiguous. + query = query.transpose(1, 2).contiguous() + key = key.transpose(1, 2).contiguous() + value = value.transpose(1, 2).contiguous() + tensors_to_save += (query, key, value) + + out, lse, cum_seq_q, cum_seq_k, max_q, max_k, philox_seed, philox_offset, debug_attn_mask = ( + torch.ops.aten._scaled_dot_product_cudnn_attention( + query=query, + key=key, + value=value, + attn_bias=attn_mask, + compute_log_sumexp=return_lse, + dropout_p=dropout_p, + is_causal=is_causal, + return_debug_mask=False, + scale=scale, + ) + ) + + tensors_to_save += (out, lse, cum_seq_q, cum_seq_k, philox_seed, philox_offset) + if _save_ctx: + ctx.save_for_backward(*tensors_to_save) + ctx.dropout_p = dropout_p + ctx.is_causal = is_causal + ctx.scale = scale + ctx.attn_mask = attn_mask + ctx.max_q = max_q + ctx.max_k = max_k + + out = out.transpose(1, 2).contiguous() + if lse is not None: + lse = lse.transpose(1, 2).contiguous() + return (out, lse) if return_lse else out + + +# backward declaration: +# aten::_scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor attn_bias, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, *, float? scale=None) -> (Tensor, Tensor, Tensor) +def _cudnn_attention_backward_op( + ctx: torch.autograd.function.FunctionCtx, + grad_out: torch.Tensor, + *args, + **kwargs, +): + query, key, value, out, lse, cum_seq_q, cum_seq_k, philox_seed, philox_offset = ctx.saved_tensors + + grad_out = grad_out.transpose(1, 2).contiguous() + key = key.transpose(1, 2).contiguous() + value = value.transpose(1, 2).contiguous() + + # Cannot pass first 5 arguments as kwargs because: https://github.com/pytorch/pytorch/blob/d26ca5de058dbcf56ac52bb43e84dd98df2ace97/torch/_dynamo/variables/torch.py#L1341 + grad_query, grad_key, grad_value = torch.ops.aten._scaled_dot_product_cudnn_attention_backward( + grad_out, + query, + key, + value, + out, + logsumexp=lse, + philox_seed=philox_seed, + philox_offset=philox_offset, + attn_bias=ctx.attn_mask, + cum_seq_q=cum_seq_q, + cum_seq_k=cum_seq_k, + max_q=ctx.max_q, + max_k=ctx.max_k, + dropout_p=ctx.dropout_p, + is_causal=ctx.is_causal, + scale=ctx.scale, + ) + grad_query, grad_key, grad_value = (x.transpose(1, 2).contiguous() for x in (grad_query, grad_key, grad_value)) + + return grad_query, grad_key, grad_value + + +# Adapted from: https://github.com/Dao-AILab/flash-attention/blob/fd2fc9d85c8e54e5c20436465bca709bc1a6c5a1/flash_attn/flash_attn_interface.py#L807 +def _flash_attention_forward_op( + ctx: torch.autograd.function.FunctionCtx, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + dropout_p: float = 0.0, + is_causal: bool = False, + scale: Optional[float] = None, + enable_gqa: bool = False, + return_lse: bool = False, + _save_ctx: bool = True, + _parallel_config: Optional["ParallelConfig"] = None, +): + if attn_mask is not None: + raise ValueError("`attn_mask` is not yet supported for flash-attn 2.") + if enable_gqa: + raise ValueError("`enable_gqa` is not yet supported for flash-attn 2.") + + # Hardcoded for now + window_size = (-1, -1) + softcap = 0.0 + alibi_slopes = None + deterministic = False + grad_enabled = any(x.requires_grad for x in (query, key, value)) + + if scale is None: + scale = query.shape[-1] ** (-0.5) + + # flash-attn only returns LSE if dropout_p > 0. So, we need to workaround. + if grad_enabled or (_parallel_config is not None and _parallel_config.context_parallel_config._world_size > 1): + dropout_p = dropout_p if dropout_p > 0 else 1e-30 + + with torch.set_grad_enabled(grad_enabled): + out, lse, S_dmask, rng_state = _wrapped_flash_attn_forward( + query, + key, + value, + dropout_p, + scale, + is_causal, + window_size[0], + window_size[1], + softcap, + alibi_slopes, + return_lse, + ) + lse = lse.permute(0, 2, 1) + + if _save_ctx: + ctx.save_for_backward(query, key, value, out, lse, rng_state) + ctx.dropout_p = dropout_p + ctx.scale = scale + ctx.is_causal = is_causal + ctx.window_size = window_size + ctx.softcap = softcap + ctx.alibi_slopes = alibi_slopes + ctx.deterministic = deterministic + + return (out, lse) if return_lse else out + + +def _flash_attention_backward_op( + ctx: torch.autograd.function.FunctionCtx, + grad_out: torch.Tensor, + *args, + **kwargs, +): + query, key, value, out, lse, rng_state = ctx.saved_tensors + grad_query, grad_key, grad_value = torch.empty_like(query), torch.empty_like(key), torch.empty_like(value) + + lse_d = _wrapped_flash_attn_backward( # noqa: F841 + grad_out, + query, + key, + value, + out, + lse, + grad_query, + grad_key, + grad_value, + ctx.dropout_p, + ctx.scale, + ctx.is_causal, + ctx.window_size[0], + ctx.window_size[1], + ctx.softcap, + ctx.alibi_slopes, + ctx.deterministic, + rng_state, + ) + + # Head dimension may have been padded + grad_query = grad_query[..., : grad_out.shape[-1]] + grad_key = grad_key[..., : grad_out.shape[-1]] + grad_value = grad_value[..., : grad_out.shape[-1]] + + return grad_query, grad_key, grad_value + + +def _sage_attention_forward_op( + ctx: torch.autograd.function.FunctionCtx, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + dropout_p: float = 0.0, + is_causal: bool = False, + scale: Optional[float] = None, + enable_gqa: bool = False, + return_lse: bool = False, + _save_ctx: bool = True, + _parallel_config: Optional["ParallelConfig"] = None, +): + if attn_mask is not None: + raise ValueError("`attn_mask` is not yet supported for Sage attention.") + if dropout_p > 0.0: + raise ValueError("`dropout_p` is not yet supported for Sage attention.") + if enable_gqa: + raise ValueError("`enable_gqa` is not yet supported for Sage attention.") + + out = sageattn( + q=query, + k=key, + v=value, + tensor_layout="NHD", + is_causal=is_causal, + sm_scale=scale, + return_lse=return_lse, + ) + lse = None + if return_lse: + out, lse, *_ = out + lse = lse.permute(0, 2, 1) + + return (out, lse) if return_lse else out + + +def _sage_attention_backward_op( + ctx: torch.autograd.function.FunctionCtx, + grad_out: torch.Tensor, + *args, +): + raise NotImplementedError("Backward pass is not implemented for Sage attention.") + + +# ===== Context parallel ===== + + +# Reference: +# - https://github.com/pytorch/pytorch/blob/f58a680d09e13658a52c6ba05c63c15759846bcc/torch/distributed/_functional_collectives.py#L827 +# - https://github.com/pytorch/pytorch/blob/f58a680d09e13658a52c6ba05c63c15759846bcc/torch/distributed/_functional_collectives.py#L246 +# For fullgraph=True tracing compatibility (since FakeTensor does not have a `wait` method): +def _wait_tensor(tensor): + if isinstance(tensor, funcol.AsyncCollectiveTensor): + tensor = tensor.wait() + return tensor + + +def _all_to_all_single(x: torch.Tensor, group) -> torch.Tensor: + shape = x.shape + # HACK: We need to flatten because despite making tensors contiguous, torch single-file-ization + # to benchmark triton codegen fails somewhere: + # buf25 = torch.ops._c10d_functional.all_to_all_single.default(buf24, [1, 1], [1, 1], '3') + # ValueError: Tensors must be contiguous + x = x.flatten() + x = funcol.all_to_all_single(x, None, None, group) + x = x.reshape(shape) + x = _wait_tensor(x) + return x + + +class TemplatedRingAttention(torch.autograd.Function): + @staticmethod + def forward( + ctx: torch.autograd.function.FunctionCtx, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor], + dropout_p: float, + is_causal: bool, + scale: Optional[float], + enable_gqa: bool, + return_lse: bool, + forward_op, + backward_op, + _parallel_config: Optional["ParallelConfig"] = None, + ): + ring_mesh = _parallel_config.context_parallel_config._ring_mesh + rank = _parallel_config.context_parallel_config._ring_local_rank + world_size = _parallel_config.context_parallel_config.ring_degree + next_rank = (rank + 1) % world_size + prev_out = prev_lse = None + + ctx.forward_op = forward_op + ctx.backward_op = backward_op + ctx.q_shape = query.shape + ctx.kv_shape = key.shape + ctx._parallel_config = _parallel_config + + kv_buffer = torch.cat([key.flatten(), value.flatten()]).contiguous() + kv_buffer = funcol.all_gather_tensor(kv_buffer, gather_dim=0, group=ring_mesh.get_group()) + kv_buffer = kv_buffer.chunk(world_size) + + for i in range(world_size): + if i > 0: + kv = kv_buffer[next_rank] + key_numel = key.numel() + key = kv[:key_numel].reshape_as(key) + value = kv[key_numel:].reshape_as(value) + next_rank = (next_rank + 1) % world_size + + out, lse = forward_op( + ctx, + query, + key, + value, + attn_mask, + dropout_p, + is_causal, + scale, + enable_gqa, + True, + _save_ctx=i == 0, + _parallel_config=_parallel_config, + ) + + if _parallel_config.context_parallel_config.convert_to_fp32: + out = out.to(torch.float32) + lse = lse.to(torch.float32) + + lse = lse.unsqueeze(-1) + if prev_out is not None: + out = prev_out - torch.nn.functional.sigmoid(lse - prev_lse) * (prev_out - out) + lse = prev_lse - torch.nn.functional.logsigmoid(prev_lse - lse) + prev_out = out + prev_lse = lse + + out = out.to(query.dtype) + lse = lse.squeeze(-1) + + return (out, lse) if return_lse else out + + @staticmethod + def backward( + ctx: torch.autograd.function.FunctionCtx, + grad_out: torch.Tensor, + *args, + ): + ring_mesh = ctx._parallel_config.context_parallel_config._ring_mesh + rank = ctx._parallel_config.context_parallel_config._ring_local_rank + world_size = ctx._parallel_config.context_parallel_config.ring_degree + next_rank = (rank + 1) % world_size + next_ranks = list(range(1, world_size)) + [0] + + accum_dtype = torch.float32 if ctx._parallel_config.context_parallel_config.convert_to_fp32 else grad_out.dtype + grad_query = torch.zeros(ctx.q_shape, dtype=accum_dtype, device=grad_out.device) + grad_key = torch.zeros(ctx.kv_shape, dtype=accum_dtype, device=grad_out.device) + grad_value = torch.zeros(ctx.kv_shape, dtype=accum_dtype, device=grad_out.device) + next_grad_kv = None + + query, key, value, *_ = ctx.saved_tensors + kv_buffer = torch.cat([key.flatten(), value.flatten()]).contiguous() + kv_buffer = funcol.all_gather_tensor(kv_buffer, gather_dim=0, group=ring_mesh.get_group()) + kv_buffer = kv_buffer.chunk(world_size) + + for i in range(world_size): + if i > 0: + kv = kv_buffer[next_rank] + key_numel = key.numel() + key = kv[:key_numel].reshape_as(key) + value = kv[key_numel:].reshape_as(value) + next_rank = (next_rank + 1) % world_size + + grad_query_op, grad_key_op, grad_value_op, *_ = ctx.backward_op(ctx, grad_out) + + if i > 0: + grad_kv_buffer = _wait_tensor(next_grad_kv) + grad_key_numel = grad_key.numel() + grad_key = grad_kv_buffer[:grad_key_numel].reshape_as(grad_key) + grad_value = grad_kv_buffer[grad_key_numel:].reshape_as(grad_value) + + grad_query += grad_query_op + grad_key += grad_key_op + grad_value += grad_value_op + + if i < world_size - 1: + grad_kv_buffer = torch.cat([grad_key.flatten(), grad_value.flatten()]).contiguous() + next_grad_kv = funcol.permute_tensor(grad_kv_buffer, next_ranks, group=ring_mesh.get_group()) + + grad_query, grad_key, grad_value = (x.to(grad_out.dtype) for x in (grad_query, grad_key, grad_value)) + + return grad_query, grad_key, grad_value, None, None, None, None, None, None, None, None + + +class TemplatedUlyssesAttention(torch.autograd.Function): + @staticmethod + def forward( + ctx: torch.autograd.function.FunctionCtx, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor], + dropout_p: float, + is_causal: bool, + scale: Optional[float], + enable_gqa: bool, + return_lse: bool, + forward_op, + backward_op, + _parallel_config: Optional["ParallelConfig"] = None, + ): + ulysses_mesh = _parallel_config.context_parallel_config._ulysses_mesh + world_size = _parallel_config.context_parallel_config.ulysses_degree + group = ulysses_mesh.get_group() + + ctx.forward_op = forward_op + ctx.backward_op = backward_op + ctx._parallel_config = _parallel_config + + B, S_Q_LOCAL, H, D = query.shape + _, S_KV_LOCAL, _, _ = key.shape + H_LOCAL = H // world_size + query = query.reshape(B, S_Q_LOCAL, world_size, H_LOCAL, D).permute(2, 1, 0, 3, 4).contiguous() + key = key.reshape(B, S_KV_LOCAL, world_size, H_LOCAL, D).permute(2, 1, 0, 3, 4).contiguous() + value = value.reshape(B, S_KV_LOCAL, world_size, H_LOCAL, D).permute(2, 1, 0, 3, 4).contiguous() + query, key, value = (_all_to_all_single(x, group) for x in (query, key, value)) + query, key, value = (x.flatten(0, 1).permute(1, 0, 2, 3).contiguous() for x in (query, key, value)) + + out = forward_op( + ctx, + query, + key, + value, + attn_mask, + dropout_p, + is_causal, + scale, + enable_gqa, + return_lse, + _save_ctx=True, + _parallel_config=_parallel_config, + ) + if return_lse: + out, lse, *_ = out + + out = out.reshape(B, world_size, S_Q_LOCAL, H_LOCAL, D).permute(1, 3, 0, 2, 4).contiguous() + out = _all_to_all_single(out, group) + out = out.flatten(0, 1).permute(1, 2, 0, 3).contiguous() + + if return_lse: + lse = lse.reshape(B, world_size, S_Q_LOCAL, H_LOCAL).permute(1, 3, 0, 2).contiguous() + lse = _all_to_all_single(lse, group) + lse = lse.flatten(0, 1).permute(1, 2, 0).contiguous() + else: + lse = None + + return (out, lse) if return_lse else out + + @staticmethod + def backward( + ctx: torch.autograd.function.FunctionCtx, + grad_out: torch.Tensor, + *args, + ): + ulysses_mesh = ctx._parallel_config.context_parallel_config._ulysses_mesh + world_size = ctx._parallel_config.context_parallel_config.ulysses_degree + group = ulysses_mesh.get_group() + + B, S_LOCAL, H, D = grad_out.shape + H_LOCAL = H // world_size + + grad_out = grad_out.reshape(B, S_LOCAL, world_size, H_LOCAL, D).permute(2, 1, 0, 3, 4).contiguous() + grad_out = _all_to_all_single(grad_out, group) + grad_out = grad_out.flatten(0, 1).permute(1, 0, 2, 3).contiguous() + + grad_query_op, grad_key_op, grad_value_op, *_ = ctx.backward_op(ctx, grad_out) + + grad_query, grad_key, grad_value = ( + x.reshape(B, world_size, S_LOCAL, H_LOCAL, D).permute(1, 3, 0, 2, 4).contiguous() + for x in (grad_query_op, grad_key_op, grad_value_op) + ) + grad_query, grad_key, grad_value = (_all_to_all_single(x, group) for x in (grad_query, grad_key, grad_value)) + grad_query, grad_key, grad_value = ( + x.flatten(0, 1).permute(1, 2, 0, 3).contiguous() for x in (grad_query, grad_key, grad_value) + ) + + return grad_query, grad_key, grad_value, None, None, None, None, None, None, None, None + + +def _templated_context_parallel_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + dropout_p: float = 0.0, + is_causal: bool = False, + scale: Optional[float] = None, + enable_gqa: bool = False, + return_lse: bool = False, + *, + forward_op, + backward_op, + _parallel_config: Optional["ParallelConfig"] = None, +): + if attn_mask is not None: + raise ValueError("Attention mask is not yet supported for templated attention.") + if is_causal: + raise ValueError("Causal attention is not yet supported for templated attention.") + if enable_gqa: + raise ValueError("GQA is not yet supported for templated attention.") + + # TODO: add support for unified attention with ring/ulysses degree both being > 1 + if _parallel_config.context_parallel_config.ring_degree > 1: + return TemplatedRingAttention.apply( + query, + key, + value, + attn_mask, + dropout_p, + is_causal, + scale, + enable_gqa, + return_lse, + forward_op, + backward_op, + _parallel_config, + ) + elif _parallel_config.context_parallel_config.ulysses_degree > 1: + return TemplatedUlyssesAttention.apply( + query, + key, + value, + attn_mask, + dropout_p, + is_causal, + scale, + enable_gqa, + return_lse, + forward_op, + backward_op, + _parallel_config, + ) + else: + raise ValueError("Reaching this branch of code is unexpected. Please report a bug.") # ===== Attention backends ===== @@ -545,34 +1179,50 @@ def _(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> Tuple[torc @_AttentionBackendRegistry.register( AttentionBackendName.FLASH, constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape], + supports_context_parallel=True, ) def _flash_attention( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, dropout_p: float = 0.0, - scale: Optional[float] = None, is_causal: bool = False, - window_size: Tuple[int, int] = (-1, -1), - softcap: float = 0.0, - alibi_slopes: Optional[torch.Tensor] = None, - deterministic: bool = False, - return_attn_probs: bool = False, + scale: Optional[float] = None, + return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: - out = flash_attn_func( - q=query, - k=key, - v=value, - dropout_p=dropout_p, - softmax_scale=scale, - causal=is_causal, - window_size=window_size, - softcap=softcap, - alibi_slopes=alibi_slopes, - deterministic=deterministic, - return_attn_probs=return_attn_probs, - ) - return out + lse = None + if _parallel_config is None: + out = flash_attn_func( + q=query, + k=key, + v=value, + dropout_p=dropout_p, + softmax_scale=scale, + causal=is_causal, + return_attn_probs=return_lse, + ) + if return_lse: + out, lse, *_ = out + else: + out = _templated_context_parallel_attention( + query, + key, + value, + None, + dropout_p, + is_causal, + scale, + False, + return_lse, + forward_op=_flash_attention_forward_op, + backward_op=_flash_attention_backward_op, + _parallel_config=_parallel_config, + ) + if return_lse: + out, lse = out + + return (out, lse) if return_lse else out @_AttentionBackendRegistry.register( @@ -583,19 +1233,12 @@ def _flash_varlen_attention( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - cu_seqlens_q: Optional[torch.Tensor] = None, - cu_seqlens_k: Optional[torch.Tensor] = None, - max_seqlen_q: Optional[int] = None, - max_seqlen_k: Optional[int] = None, + attn_mask: Optional[torch.Tensor] = None, dropout_p: float = 0.0, scale: Optional[float] = None, is_causal: bool = False, - window_size: Tuple[int, int] = (-1, -1), - softcap: float = 0.0, - alibi_slopes: Optional[torch.Tensor] = None, - deterministic: bool = False, - return_attn_probs: bool = False, - attn_mask: Optional[torch.Tensor] = None, + return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: batch_size, seq_len_q, _, _ = query.shape _, seq_len_kv, _, _ = key.shape @@ -603,16 +1246,11 @@ def _flash_varlen_attention( if attn_mask is not None: attn_mask = _normalize_attn_mask(attn_mask, batch_size, seq_len_kv) - if any(x is None for x in (cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k)): - (_, seqlens_k), (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = ( - _prepare_for_flash_attn_or_sage_varlen( - batch_size, seq_len_q, seq_len_kv, attn_mask=attn_mask, device=query.device - ) + (_, seqlens_k), (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = ( + _prepare_for_flash_attn_or_sage_varlen( + batch_size, seq_len_q, seq_len_kv, attn_mask=attn_mask, device=query.device ) - else: - seqlens_k = torch.full((batch_size,), max_seqlen_k, dtype=torch.int32, device=query.device) - cu_seqlens_q = cu_seqlens_q.to(dtype=torch.int32, device=query.device) - cu_seqlens_k = cu_seqlens_k.to(dtype=torch.int32, device=query.device) + ) key_valid, value_valid = [], [] for b in range(batch_size): @@ -635,11 +1273,7 @@ def _flash_varlen_attention( dropout_p=dropout_p, softmax_scale=scale, causal=is_causal, - window_size=window_size, - softcap=softcap, - alibi_slopes=alibi_slopes, - deterministic=deterministic, - return_attn_probs=return_attn_probs, + return_attn_probs=return_lse, ) out = out.unflatten(0, (batch_size, -1)) @@ -656,30 +1290,17 @@ def _flash_attention_3( value: torch.Tensor, scale: Optional[float] = None, is_causal: bool = False, - window_size: Tuple[int, int] = (-1, -1), - softcap: float = 0.0, - deterministic: bool = False, - return_attn_probs: bool = False, + return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: - out, lse, *_ = flash_attn_3_func( + out, lse = _wrapped_flash_attn_3( q=query, k=key, v=value, softmax_scale=scale, causal=is_causal, - qv=None, - q_descale=None, - k_descale=None, - v_descale=None, - window_size=window_size, - attention_chunk=0, - softcap=softcap, - num_splits=1, - pack_gqa=None, - deterministic=deterministic, - sm_margin=0, ) - return (out, lse) if return_attn_probs else out + return (out, lse) if return_lse else out @_AttentionBackendRegistry.register( @@ -696,6 +1317,7 @@ def _flash_attention_3_hub( softcap: float = 0.0, deterministic: bool = False, return_attn_probs: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: out = flash_attn_3_func_hub( q=query, @@ -728,17 +1350,11 @@ def _flash_varlen_attention_3( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - cu_seqlens_q: Optional[torch.Tensor] = None, - cu_seqlens_k: Optional[torch.Tensor] = None, - max_seqlen_q: Optional[int] = None, - max_seqlen_k: Optional[int] = None, + attn_mask: Optional[torch.Tensor] = None, scale: Optional[float] = None, is_causal: bool = False, - window_size: Tuple[int, int] = (-1, -1), - softcap: float = 0.0, - deterministic: bool = False, - return_attn_probs: bool = False, - attn_mask: Optional[torch.Tensor] = None, + return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: batch_size, seq_len_q, _, _ = query.shape _, seq_len_kv, _, _ = key.shape @@ -746,16 +1362,11 @@ def _flash_varlen_attention_3( if attn_mask is not None: attn_mask = _normalize_attn_mask(attn_mask, batch_size, seq_len_kv) - if any(x is None for x in (cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k)): - (_, seqlens_k), (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = ( - _prepare_for_flash_attn_or_sage_varlen( - batch_size, seq_len_q, seq_len_kv, attn_mask=attn_mask, device=query.device - ) + (_, seqlens_k), (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = ( + _prepare_for_flash_attn_or_sage_varlen( + batch_size, seq_len_q, seq_len_kv, attn_mask=attn_mask, device=query.device ) - else: - seqlens_k = torch.full((batch_size,), max_seqlen_k, dtype=torch.int32, device=query.device) - cu_seqlens_q = cu_seqlens_q.to(dtype=torch.int32, device=query.device) - cu_seqlens_k = cu_seqlens_k.to(dtype=torch.int32, device=query.device) + ) key_valid, value_valid = [], [] for b in range(batch_size): @@ -775,24 +1386,12 @@ def _flash_varlen_attention_3( cu_seqlens_k=cu_seqlens_k, max_seqlen_q=max_seqlen_q, max_seqlen_k=max_seqlen_k, - seqused_q=None, - seqused_k=None, softmax_scale=scale, causal=is_causal, - qv=None, - q_descale=None, - k_descale=None, - v_descale=None, - window_size=window_size, - softcap=softcap, - num_splits=1, - pack_gqa=None, - deterministic=deterministic, - sm_margin=0, ) out = out.unflatten(0, (batch_size, -1)) - return (out, lse) if return_attn_probs else out + return (out, lse) if return_lse else out @_AttentionBackendRegistry.register( @@ -808,7 +1407,7 @@ def _native_flex_attention( scale: Optional[float] = None, enable_gqa: bool = False, return_lse: bool = False, - kernel_options: Optional[Dict[str, Any]] = None, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: # TODO: should we LRU cache the block mask creation? score_mod = None @@ -853,7 +1452,6 @@ def _native_flex_attention( scale=scale, enable_gqa=enable_gqa, return_lse=return_lse, - kernel_options=kernel_options, ) out = out.permute(0, 2, 1, 3) return out @@ -872,7 +1470,11 @@ def _native_attention( is_causal: bool = False, scale: Optional[float] = None, enable_gqa: bool = False, + return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: + if return_lse: + raise ValueError("Native attention backend does not support setting `return_lse=True`.") query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value)) out = torch.nn.functional.scaled_dot_product_attention( query=query, @@ -891,6 +1493,7 @@ def _native_attention( @_AttentionBackendRegistry.register( AttentionBackendName._NATIVE_CUDNN, constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape], + supports_context_parallel=True, ) def _native_cudnn_attention( query: torch.Tensor, @@ -901,21 +1504,43 @@ def _native_cudnn_attention( is_causal: bool = False, scale: Optional[float] = None, enable_gqa: bool = False, + return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: - query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value)) - with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.CUDNN_ATTENTION): - out = torch.nn.functional.scaled_dot_product_attention( - query=query, - key=key, - value=value, - attn_mask=attn_mask, - dropout_p=dropout_p, - is_causal=is_causal, - scale=scale, - enable_gqa=enable_gqa, + lse = None + if _parallel_config is None and not return_lse: + query, key, value = (x.permute(0, 2, 1, 3).contiguous() for x in (query, key, value)) + with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.CUDNN_ATTENTION): + out = torch.nn.functional.scaled_dot_product_attention( + query=query, + key=key, + value=value, + attn_mask=attn_mask, + dropout_p=dropout_p, + is_causal=is_causal, + scale=scale, + enable_gqa=enable_gqa, + ) + out = out.permute(0, 2, 1, 3) + else: + out = _templated_context_parallel_attention( + query, + key, + value, + attn_mask, + dropout_p, + is_causal, + scale, + enable_gqa, + return_lse, + forward_op=_cudnn_attention_forward_op, + backward_op=_cudnn_attention_backward_op, + _parallel_config=_parallel_config, ) - out = out.permute(0, 2, 1, 3) - return out + if return_lse: + out, lse = out + + return (out, lse) if return_lse else out @_AttentionBackendRegistry.register( @@ -931,7 +1556,11 @@ def _native_efficient_attention( is_causal: bool = False, scale: Optional[float] = None, enable_gqa: bool = False, + return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: + if return_lse: + raise ValueError("Native efficient attention backend does not support setting `return_lse=True`.") query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value)) with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION): out = torch.nn.functional.scaled_dot_product_attention( @@ -960,7 +1589,11 @@ def _native_flash_attention( is_causal: bool = False, scale: Optional[float] = None, enable_gqa: bool = False, + return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: + if return_lse: + raise ValueError("Native flash attention backend does not support setting `return_lse=True`.") query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value)) with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION): out = torch.nn.functional.scaled_dot_product_attention( @@ -990,7 +1623,11 @@ def _native_math_attention( is_causal: bool = False, scale: Optional[float] = None, enable_gqa: bool = False, + return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: + if return_lse: + raise ValueError("Native math attention backend does not support setting `return_lse=True`.") query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value)) with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH): out = torch.nn.functional.scaled_dot_product_attention( @@ -1017,7 +1654,11 @@ def _native_npu_attention( value: torch.Tensor, dropout_p: float = 0.0, scale: Optional[float] = None, + return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: + if return_lse: + raise ValueError("NPU attention backend does not support setting `return_lse=True`.") query, key, value = (x.transpose(1, 2).contiguous() for x in (query, key, value)) out = npu_fusion_attention( query, @@ -1047,7 +1688,11 @@ def _native_xla_attention( key: torch.Tensor, value: torch.Tensor, is_causal: bool = False, + return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: + if return_lse: + raise ValueError("XLA attention backend does not support setting `return_lse=True`.") query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value)) query = query / math.sqrt(query.shape[-1]) out = xla_flash_attention( @@ -1063,6 +1708,7 @@ def _native_xla_attention( @_AttentionBackendRegistry.register( AttentionBackendName.SAGE, constraints=[_check_device_cuda, _check_qkv_dtype_bf16_or_fp16, _check_shape], + supports_context_parallel=True, ) def _sage_attention( query: torch.Tensor, @@ -1071,16 +1717,40 @@ def _sage_attention( is_causal: bool = False, scale: Optional[float] = None, return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: - return sageattn( - q=query, - k=key, - v=value, - tensor_layout="NHD", - is_causal=is_causal, - sm_scale=scale, - return_lse=return_lse, - ) + lse = None + if _parallel_config is None: + out = sageattn( + q=query, + k=key, + v=value, + tensor_layout="NHD", + is_causal=is_causal, + sm_scale=scale, + return_lse=return_lse, + ) + if return_lse: + out, lse, *_ = out + else: + out = _templated_context_parallel_attention( + query, + key, + value, + None, + 0.0, + is_causal, + scale, + False, + return_lse, + forward_op=_sage_attention_forward_op, + backward_op=_sage_attention_backward_op, + _parallel_config=_parallel_config, + ) + if return_lse: + out, lse = out + + return (out, lse) if return_lse else out @_AttentionBackendRegistry.register( @@ -1091,31 +1761,26 @@ def _sage_varlen_attention( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - cu_seqlens_q: Optional[torch.Tensor] = None, - cu_seqlens_k: Optional[torch.Tensor] = None, - max_seqlen_q: Optional[int] = None, - max_seqlen_k: Optional[int] = None, + attn_mask: Optional[torch.Tensor] = None, is_causal: bool = False, scale: Optional[float] = None, - smooth_k: bool = True, - attn_mask: Optional[torch.Tensor] = None, + return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: + if return_lse: + raise ValueError("Sage varlen backend does not support setting `return_lse=True`.") + batch_size, seq_len_q, _, _ = query.shape _, seq_len_kv, _, _ = key.shape if attn_mask is not None: attn_mask = _normalize_attn_mask(attn_mask, batch_size, seq_len_kv) - if any(x is None for x in (cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k)): - (_, seqlens_k), (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = ( - _prepare_for_flash_attn_or_sage_varlen( - batch_size, seq_len_q, seq_len_kv, attn_mask=attn_mask, device=query.device - ) + (_, seqlens_k), (cu_seqlens_q, cu_seqlens_k), (max_seqlen_q, max_seqlen_k) = ( + _prepare_for_flash_attn_or_sage_varlen( + batch_size, seq_len_q, seq_len_kv, attn_mask=attn_mask, device=query.device ) - else: - seqlens_k = torch.full((batch_size,), max_seqlen_k, dtype=torch.int32, device=query.device) - cu_seqlens_q = cu_seqlens_q.to(dtype=torch.int32, device=query.device) - cu_seqlens_k = cu_seqlens_k.to(dtype=torch.int32, device=query.device) + ) key_valid, value_valid = [], [] for b in range(batch_size): @@ -1137,7 +1802,6 @@ def _sage_varlen_attention( max_seqlen_k=max_seqlen_k, is_causal=is_causal, sm_scale=scale, - smooth_k=smooth_k, ) out = out.unflatten(0, (batch_size, -1)) @@ -1154,11 +1818,8 @@ def _sage_qk_int8_pv_fp8_cuda_attention( value: torch.Tensor, is_causal: bool = False, scale: Optional[float] = None, - qk_quant_gran: _SAGE_ATTENTION_QK_QUANT_GRAN = "per_thread", - pv_accum_dtype: _SAGE_ATTENTION_PV_ACCUM_DTYPE = "fp32+fp32", - smooth_k: bool = True, - smooth_v: bool = False, return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: return sageattn_qk_int8_pv_fp8_cuda( q=query, @@ -1166,11 +1827,7 @@ def _sage_qk_int8_pv_fp8_cuda_attention( v=value, tensor_layout="NHD", is_causal=is_causal, - qk_quant_gran=qk_quant_gran, sm_scale=scale, - pv_accum_dtype=pv_accum_dtype, - smooth_k=smooth_k, - smooth_v=smooth_v, return_lse=return_lse, ) @@ -1185,10 +1842,8 @@ def _sage_qk_int8_pv_fp8_cuda_sm90_attention( value: torch.Tensor, is_causal: bool = False, scale: Optional[float] = None, - qk_quant_gran: _SAGE_ATTENTION_QK_QUANT_GRAN = "per_thread", - pv_accum_dtype: _SAGE_ATTENTION_PV_ACCUM_DTYPE = "fp32+fp32", - smooth_k: bool = True, return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: return sageattn_qk_int8_pv_fp8_cuda_sm90( q=query, @@ -1196,10 +1851,7 @@ def _sage_qk_int8_pv_fp8_cuda_sm90_attention( v=value, tensor_layout="NHD", is_causal=is_causal, - qk_quant_gran=qk_quant_gran, sm_scale=scale, - pv_accum_dtype=pv_accum_dtype, - smooth_k=smooth_k, return_lse=return_lse, ) @@ -1214,11 +1866,8 @@ def _sage_qk_int8_pv_fp16_cuda_attention( value: torch.Tensor, is_causal: bool = False, scale: Optional[float] = None, - qk_quant_gran: _SAGE_ATTENTION_QK_QUANT_GRAN = "per_thread", - pv_accum_dtype: _SAGE_ATTENTION_PV_ACCUM_DTYPE = "fp32", - smooth_k: bool = True, - smooth_v: bool = False, return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: return sageattn_qk_int8_pv_fp16_cuda( q=query, @@ -1226,11 +1875,7 @@ def _sage_qk_int8_pv_fp16_cuda_attention( v=value, tensor_layout="NHD", is_causal=is_causal, - qk_quant_gran=qk_quant_gran, sm_scale=scale, - pv_accum_dtype=pv_accum_dtype, - smooth_k=smooth_k, - smooth_v=smooth_v, return_lse=return_lse, ) @@ -1245,19 +1890,16 @@ def _sage_qk_int8_pv_fp16_triton_attention( value: torch.Tensor, is_causal: bool = False, scale: Optional[float] = None, - quantization_backend: _SAGE_ATTENTION_QUANTIZATION_BACKEND = "triton", - smooth_k: bool = True, return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: return sageattn_qk_int8_pv_fp16_triton( q=query, k=key, v=value, tensor_layout="NHD", - quantization_backend=quantization_backend, is_causal=is_causal, sm_scale=scale, - smooth_k=smooth_k, return_lse=return_lse, ) @@ -1275,7 +1917,12 @@ def _xformers_attention( is_causal: bool = False, scale: Optional[float] = None, enable_gqa: bool = False, + return_lse: bool = False, + _parallel_config: Optional["ParallelConfig"] = None, ) -> torch.Tensor: + if return_lse: + raise ValueError("xformers attention backend does not support setting `return_lse=True`.") + batch_size, seq_len_q, num_heads_q, _ = query.shape _, seq_len_kv, num_heads_kv, _ = key.shape diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 2388989be2..b3d74954bd 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -65,6 +65,7 @@ from ..utils.hub_utils import ( populate_model_card, ) from ..utils.torch_utils import empty_device_cache +from ._modeling_parallel import ContextParallelConfig, ContextParallelModelPlan, ParallelConfig from .model_loading_utils import ( _caching_allocator_warmup, _determine_device_map, @@ -248,6 +249,8 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): _skip_layerwise_casting_patterns = None _supports_group_offloading = True _repeated_blocks = [] + _parallel_config = None + _cp_plan = None def __init__(self): super().__init__() @@ -620,8 +623,8 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): def reset_attention_backend(self) -> None: """ - Resets the attention backend for the model. Following calls to `forward` will use the environment default or - the torch native scaled dot product attention. + Resets the attention backend for the model. Following calls to `forward` will use the environment default, if + set, or the torch native scaled dot product attention. """ from .attention import AttentionModuleMixin from .attention_processor import Attention, MochiAttention @@ -960,6 +963,7 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): quantization_config = kwargs.pop("quantization_config", None) dduf_entries: Optional[Dict[str, DDUFEntry]] = kwargs.pop("dduf_entries", None) disable_mmap = kwargs.pop("disable_mmap", False) + parallel_config: Optional[Union[ParallelConfig, ContextParallelConfig]] = kwargs.pop("parallel_config", None) is_parallel_loading_enabled = HF_ENABLE_PARALLEL_LOADING if is_parallel_loading_enabled and not low_cpu_mem_usage: @@ -1340,6 +1344,9 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): # Set model in evaluation mode to deactivate DropOut modules by default model.eval() + if parallel_config is not None: + model.enable_parallelism(config=parallel_config) + if output_loading_info: return model, loading_info @@ -1478,6 +1485,73 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): f"Regional compilation failed because {repeated_blocks} classes are not found in the model. " ) + def enable_parallelism( + self, + *, + config: Union[ParallelConfig, ContextParallelConfig], + cp_plan: Optional[Dict[str, ContextParallelModelPlan]] = None, + ): + from ..hooks.context_parallel import apply_context_parallel + from .attention import AttentionModuleMixin + from .attention_processor import Attention, MochiAttention + + logger.warning( + "`enable_parallelism` is an experimental feature. The API may change in the future and breaking changes may be introduced at any time without warning." + ) + + if isinstance(config, ContextParallelConfig): + config = ParallelConfig(context_parallel_config=config) + + if not torch.distributed.is_initialized(): + raise RuntimeError("torch.distributed must be initialized before calling `enable_parallelism`.") + + rank = torch.distributed.get_rank() + world_size = torch.distributed.get_world_size() + device_type = torch._C._get_accelerator().type + device_module = torch.get_device_module(device_type) + device = torch.device(device_type, rank % device_module.device_count()) + + cp_mesh = None + if config.context_parallel_config is not None: + cp_config = config.context_parallel_config + if cp_config.ring_degree < 1 or cp_config.ulysses_degree < 1: + raise ValueError("`ring_degree` and `ulysses_degree` must be greater than or equal to 1.") + if cp_config.ring_degree > 1 and cp_config.ulysses_degree > 1: + raise ValueError( + "Unified Ulysses-Ring attention is not yet supported. Please set either `ring_degree` or `ulysses_degree` to 1." + ) + if cp_config.ring_degree * cp_config.ulysses_degree > world_size: + raise ValueError( + f"The product of `ring_degree` ({cp_config.ring_degree}) and `ulysses_degree` ({cp_config.ulysses_degree}) must not exceed the world size ({world_size})." + ) + cp_mesh = torch.distributed.device_mesh.init_device_mesh( + device_type=device_type, + mesh_shape=(cp_config.ring_degree, cp_config.ulysses_degree), + mesh_dim_names=("ring", "ulysses"), + ) + + config.setup(rank, world_size, device, cp_mesh=cp_mesh) + + if cp_plan is None and self._cp_plan is None: + raise ValueError( + "`cp_plan` must be provided either as an argument or set in the model's `_cp_plan` attribute." + ) + cp_plan = cp_plan if cp_plan is not None else self._cp_plan + + if config.context_parallel_config is not None: + apply_context_parallel(self, config.context_parallel_config, cp_plan) + + self._parallel_config = config + + attention_classes = (Attention, MochiAttention, AttentionModuleMixin) + for module in self.modules(): + if not isinstance(module, attention_classes): + continue + processor = module.processor + if processor is None or not hasattr(processor, "_parallel_config"): + continue + processor._parallel_config = config + @classmethod def _load_pretrained_model( cls, diff --git a/src/diffusers/models/transformers/transformer_bria.py b/src/diffusers/models/transformers/transformer_bria.py index 04a9c5645c..d54679306e 100644 --- a/src/diffusers/models/transformers/transformer_bria.py +++ b/src/diffusers/models/transformers/transformer_bria.py @@ -120,6 +120,7 @@ def get_1d_rotary_pos_embed( class BriaAttnProcessor: _attention_backend = None + _parallel_config = None def __init__(self): if not hasattr(F, "scaled_dot_product_attention"): @@ -161,7 +162,12 @@ class BriaAttnProcessor: key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1) hidden_states = dispatch_attention_fn( - query, key, value, attn_mask=attention_mask, backend=self._attention_backend + query, + key, + value, + attn_mask=attention_mask, + backend=self._attention_backend, + parallel_config=self._parallel_config, ) hidden_states = hidden_states.flatten(2, 3) hidden_states = hidden_states.to(query.dtype) diff --git a/src/diffusers/models/transformers/transformer_flux.py b/src/diffusers/models/transformers/transformer_flux.py index 7ab371a1a1..1a44644324 100644 --- a/src/diffusers/models/transformers/transformer_flux.py +++ b/src/diffusers/models/transformers/transformer_flux.py @@ -24,6 +24,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import maybe_allow_in_graph +from .._modeling_parallel import ContextParallelInput, ContextParallelOutput from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward from ..attention_dispatch import dispatch_attention_fn from ..cache_utils import CacheMixin @@ -73,6 +74,7 @@ def _get_qkv_projections(attn: "FluxAttention", hidden_states, encoder_hidden_st class FluxAttnProcessor: _attention_backend = None + _parallel_config = None def __init__(self): if not hasattr(F, "scaled_dot_product_attention"): @@ -114,7 +116,12 @@ class FluxAttnProcessor: key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1) hidden_states = dispatch_attention_fn( - query, key, value, attn_mask=attention_mask, backend=self._attention_backend + query, + key, + value, + attn_mask=attention_mask, + backend=self._attention_backend, + parallel_config=self._parallel_config, ) hidden_states = hidden_states.flatten(2, 3) hidden_states = hidden_states.to(query.dtype) @@ -136,6 +143,7 @@ class FluxIPAdapterAttnProcessor(torch.nn.Module): """Flux Attention processor for IP-Adapter.""" _attention_backend = None + _parallel_config = None def __init__( self, hidden_size: int, cross_attention_dim: int, num_tokens=(4,), scale=1.0, device=None, dtype=None @@ -220,6 +228,7 @@ class FluxIPAdapterAttnProcessor(torch.nn.Module): dropout_p=0.0, is_causal=False, backend=self._attention_backend, + parallel_config=self._parallel_config, ) hidden_states = hidden_states.flatten(2, 3) hidden_states = hidden_states.to(query.dtype) @@ -252,6 +261,7 @@ class FluxIPAdapterAttnProcessor(torch.nn.Module): dropout_p=0.0, is_causal=False, backend=self._attention_backend, + parallel_config=self._parallel_config, ) current_ip_hidden_states = current_ip_hidden_states.reshape(batch_size, -1, attn.heads * attn.head_dim) current_ip_hidden_states = current_ip_hidden_states.to(ip_query.dtype) @@ -556,6 +566,15 @@ class FluxTransformer2DModel( _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"] _skip_layerwise_casting_patterns = ["pos_embed", "norm"] _repeated_blocks = ["FluxTransformerBlock", "FluxSingleTransformerBlock"] + _cp_plan = { + "": { + "hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False), + "encoder_hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False), + "img_ids": ContextParallelInput(split_dim=0, expected_dims=2, split_output=False), + "txt_ids": ContextParallelInput(split_dim=0, expected_dims=2, split_output=False), + }, + "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3), + } @register_to_config def __init__( diff --git a/src/diffusers/models/transformers/transformer_ltx.py b/src/diffusers/models/transformers/transformer_ltx.py index 79149fb760..9f3840690d 100644 --- a/src/diffusers/models/transformers/transformer_ltx.py +++ b/src/diffusers/models/transformers/transformer_ltx.py @@ -24,6 +24,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...loaders import FromOriginalModelMixin, PeftAdapterMixin from ...utils import USE_PEFT_BACKEND, deprecate, is_torch_version, logging, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import maybe_allow_in_graph +from .._modeling_parallel import ContextParallelInput, ContextParallelOutput from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward from ..attention_dispatch import dispatch_attention_fn from ..cache_utils import CacheMixin @@ -51,6 +52,7 @@ class LTXVideoAttnProcessor: """ _attention_backend = None + _parallel_config = None def __init__(self): if is_torch_version("<", "2.0"): @@ -100,6 +102,7 @@ class LTXVideoAttnProcessor: dropout_p=0.0, is_causal=False, backend=self._attention_backend, + parallel_config=self._parallel_config, ) hidden_states = hidden_states.flatten(2, 3) hidden_states = hidden_states.to(query.dtype) @@ -409,6 +412,18 @@ class LTXVideoTransformer3DModel( _supports_gradient_checkpointing = True _skip_layerwise_casting_patterns = ["norm"] _repeated_blocks = ["LTXVideoTransformerBlock"] + _cp_plan = { + "": { + "hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False), + "encoder_hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False), + "encoder_attention_mask": ContextParallelInput(split_dim=1, expected_dims=2, split_output=False), + }, + "rope": { + 0: ContextParallelInput(split_dim=1, expected_dims=3, split_output=True), + 1: ContextParallelInput(split_dim=1, expected_dims=3, split_output=True), + }, + "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3), + } @register_to_config def __init__( diff --git a/src/diffusers/models/transformers/transformer_qwenimage.py b/src/diffusers/models/transformers/transformer_qwenimage.py index 846add8906..05379270c1 100644 --- a/src/diffusers/models/transformers/transformer_qwenimage.py +++ b/src/diffusers/models/transformers/transformer_qwenimage.py @@ -25,6 +25,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...loaders import FromOriginalModelMixin, PeftAdapterMixin from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import maybe_allow_in_graph +from .._modeling_parallel import ContextParallelInput, ContextParallelOutput from ..attention import AttentionMixin, FeedForward from ..attention_dispatch import dispatch_attention_fn from ..attention_processor import Attention @@ -261,6 +262,7 @@ class QwenDoubleStreamAttnProcessor2_0: """ _attention_backend = None + _parallel_config = None def __init__(self): if not hasattr(F, "scaled_dot_product_attention"): @@ -334,6 +336,7 @@ class QwenDoubleStreamAttnProcessor2_0: dropout_p=0.0, is_causal=False, backend=self._attention_backend, + parallel_config=self._parallel_config, ) # Reshape back @@ -502,6 +505,18 @@ class QwenImageTransformer2DModel( _no_split_modules = ["QwenImageTransformerBlock"] _skip_layerwise_casting_patterns = ["pos_embed", "norm"] _repeated_blocks = ["QwenImageTransformerBlock"] + _cp_plan = { + "": { + "hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False), + "encoder_hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False), + "encoder_hidden_states_mask": ContextParallelInput(split_dim=1, expected_dims=2, split_output=False), + }, + "pos_embed": { + 0: ContextParallelInput(split_dim=0, expected_dims=2, split_output=True), + 1: ContextParallelInput(split_dim=0, expected_dims=2, split_output=True), + }, + "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3), + } @register_to_config def __init__( diff --git a/src/diffusers/models/transformers/transformer_skyreels_v2.py b/src/diffusers/models/transformers/transformer_skyreels_v2.py index 358759164b..6b600aa224 100644 --- a/src/diffusers/models/transformers/transformer_skyreels_v2.py +++ b/src/diffusers/models/transformers/transformer_skyreels_v2.py @@ -73,6 +73,7 @@ def _get_added_kv_projections(attn: "SkyReelsV2Attention", encoder_hidden_states class SkyReelsV2AttnProcessor: _attention_backend = None + _parallel_config = None def __init__(self): if not hasattr(F, "scaled_dot_product_attention"): @@ -139,6 +140,7 @@ class SkyReelsV2AttnProcessor: dropout_p=0.0, is_causal=False, backend=self._attention_backend, + parallel_config=self._parallel_config, ) hidden_states_img = hidden_states_img.flatten(2, 3) hidden_states_img = hidden_states_img.type_as(query) @@ -151,6 +153,7 @@ class SkyReelsV2AttnProcessor: dropout_p=0.0, is_causal=False, backend=self._attention_backend, + parallel_config=self._parallel_config, ) hidden_states = hidden_states.flatten(2, 3) diff --git a/src/diffusers/models/transformers/transformer_wan.py b/src/diffusers/models/transformers/transformer_wan.py index 968a0369c2..25c055fb56 100644 --- a/src/diffusers/models/transformers/transformer_wan.py +++ b/src/diffusers/models/transformers/transformer_wan.py @@ -23,6 +23,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...loaders import FromOriginalModelMixin, PeftAdapterMixin from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers from ...utils.torch_utils import maybe_allow_in_graph +from .._modeling_parallel import ContextParallelInput, ContextParallelOutput from ..attention import AttentionMixin, AttentionModuleMixin, FeedForward from ..attention_dispatch import dispatch_attention_fn from ..cache_utils import CacheMixin @@ -66,6 +67,7 @@ def _get_added_kv_projections(attn: "WanAttention", encoder_hidden_states_img: t class WanAttnProcessor: _attention_backend = None + _parallel_config = None def __init__(self): if not hasattr(F, "scaled_dot_product_attention"): @@ -132,6 +134,7 @@ class WanAttnProcessor: dropout_p=0.0, is_causal=False, backend=self._attention_backend, + parallel_config=self._parallel_config, ) hidden_states_img = hidden_states_img.flatten(2, 3) hidden_states_img = hidden_states_img.type_as(query) @@ -144,6 +147,7 @@ class WanAttnProcessor: dropout_p=0.0, is_causal=False, backend=self._attention_backend, + parallel_config=self._parallel_config, ) hidden_states = hidden_states.flatten(2, 3) hidden_states = hidden_states.type_as(query) @@ -539,6 +543,19 @@ class WanTransformer3DModel( _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"] _keys_to_ignore_on_load_unexpected = ["norm_added_q"] _repeated_blocks = ["WanTransformerBlock"] + _cp_plan = { + "rope": { + 0: ContextParallelInput(split_dim=1, expected_dims=4, split_output=True), + 1: ContextParallelInput(split_dim=1, expected_dims=4, split_output=True), + }, + "blocks.0": { + "hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False), + }, + "blocks.*": { + "encoder_hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False), + }, + "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3), + } @register_to_config def __init__( diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index bbb9712496..6e7d227979 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -648,6 +648,21 @@ class ConsistencyDecoderVAE(metaclass=DummyObject): requires_backends(cls, ["torch"]) +class ContextParallelConfig(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class ControlNetModel(metaclass=DummyObject): _backends = ["torch"] @@ -1053,6 +1068,21 @@ class OmniGenTransformer2DModel(metaclass=DummyObject): requires_backends(cls, ["torch"]) +class ParallelConfig(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class PixArtTransformer2DModel(metaclass=DummyObject): _backends = ["torch"] From 310fdaf5561d1b20240a2b66e978edb66175ad5c Mon Sep 17 00:00:00 2001 From: DefTruth <31974251+DefTruth@users.noreply.github.com> Date: Thu, 25 Sep 2025 01:50:57 +0800 Subject: [PATCH 14/18] Introduce cache-dit to community optimization (#12366) * docs: introduce cache-dit to diffusers * docs: introduce cache-dit to diffusers * docs: introduce cache-dit to diffusers * docs: introduce cache-dit to diffusers * docs: introduce cache-dit to diffusers * docs: introduce cache-dit to diffusers * docs: introduce cache-dit to diffusers * misc: update examples link * misc: update examples link * docs: introduce cache-dit to diffusers * docs: introduce cache-dit to diffusers * docs: introduce cache-dit to diffusers * docs: introduce cache-dit to diffusers * docs: introduce cache-dit to diffusers * Refine documentation for CacheDiT features Updated the wording for clarity and consistency in the documentation. Adjusted sections on cache acceleration, automatic block adapter, patch functor, and hybrid cache configuration. --- docs/source/en/_toctree.yml | 2 + docs/source/en/optimization/cache_dit.md | 270 +++++++++++++++++++++++ 2 files changed, 272 insertions(+) create mode 100644 docs/source/en/optimization/cache_dit.md diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 07408cc30e..96c6fbb17f 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -82,6 +82,8 @@ title: Token merging - local: optimization/deepcache title: DeepCache + - local: optimization/cache_dit + title: CacheDiT - local: optimization/tgate title: TGATE - local: optimization/xdit diff --git a/docs/source/en/optimization/cache_dit.md b/docs/source/en/optimization/cache_dit.md new file mode 100644 index 0000000000..1261423212 --- /dev/null +++ b/docs/source/en/optimization/cache_dit.md @@ -0,0 +1,270 @@ +## CacheDiT + +CacheDiT is a unified, flexible, and training-free cache acceleration framework designed to support nearly all Diffusers' DiT-based pipelines. It provides a unified cache API that supports automatic block adapter, DBCache, and more. + +To learn more, refer to the [CacheDiT](https://github.com/vipshop/cache-dit) repository. + +Install a stable release of CacheDiT from PyPI or you can install the latest version from GitHub. + + + + +```bash +pip3 install -U cache-dit +``` + + + + +```bash +pip3 install git+https://github.com/vipshop/cache-dit.git +``` + + + + +Run the command below to view supported DiT pipelines. + +```python +>>> import cache_dit +>>> cache_dit.supported_pipelines() +(30, ['Flux*', 'Mochi*', 'CogVideoX*', 'Wan*', 'HunyuanVideo*', 'QwenImage*', 'LTX*', 'Allegro*', +'CogView3Plus*', 'CogView4*', 'Cosmos*', 'EasyAnimate*', 'SkyReelsV2*', 'StableDiffusion3*', +'ConsisID*', 'DiT*', 'Amused*', 'Bria*', 'Lumina*', 'OmniGen*', 'PixArt*', 'Sana*', 'StableAudio*', +'VisualCloze*', 'AuraFlow*', 'Chroma*', 'ShapE*', 'HiDream*', 'HunyuanDiT*', 'HunyuanDiTPAG*']) +``` + +For a complete benchmark, please refer to [Benchmarks](https://github.com/vipshop/cache-dit/blob/main/bench/). + + +## Unified Cache API + +CacheDiT works by matching specific input/output patterns as shown below. + +![](https://github.com/vipshop/cache-dit/raw/main/assets/patterns-v1.png) + +Call the `enable_cache()` function on a pipeline to enable cache acceleration. This function is the entry point to many of CacheDiT's features. + +```python +import cache_dit +from diffusers import DiffusionPipeline + +# Can be any diffusion pipeline +pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image") + +# One-line code with default cache options. +cache_dit.enable_cache(pipe) + +# Just call the pipe as normal. +output = pipe(...) + +# Disable cache and run original pipe. +cache_dit.disable_cache(pipe) +``` + +## Automatic Block Adapter + +For custom or modified pipelines or transformers not included in Diffusers, use the `BlockAdapter` in `auto` mode or via manual configuration. Please check the [BlockAdapter](https://github.com/vipshop/cache-dit/blob/main/docs/User_Guide.md#automatic-block-adapter) docs for more details. Refer to [Qwen-Image w/ BlockAdapter](https://github.com/vipshop/cache-dit/blob/main/examples/adapter/run_qwen_image_adapter.py) as an example. + + +```python +from cache_dit import ForwardPattern, BlockAdapter + +# Use 🔥BlockAdapter with `auto` mode. +cache_dit.enable_cache( + BlockAdapter( + # Any DiffusionPipeline, Qwen-Image, etc. + pipe=pipe, auto=True, + # Check `📚Forward Pattern Matching` documentation and hack the code of + # of Qwen-Image, you will find that it has satisfied `FORWARD_PATTERN_1`. + forward_pattern=ForwardPattern.Pattern_1, + ), +) + +# Or, manually setup transformer configurations. +cache_dit.enable_cache( + BlockAdapter( + pipe=pipe, # Qwen-Image, etc. + transformer=pipe.transformer, + blocks=pipe.transformer.transformer_blocks, + forward_pattern=ForwardPattern.Pattern_1, + ), +) +``` + +Sometimes, a Transformer class will contain more than one transformer `blocks`. For example, FLUX.1 (HiDream, Chroma, etc) contains `transformer_blocks` and `single_transformer_blocks` (with different forward patterns). The BlockAdapter is able to detect this hybrid pattern type as well. +Refer to [FLUX.1](https://github.com/vipshop/cache-dit/blob/main/examples/adapter/run_flux_adapter.py) as an example. + +```python +# For diffusers <= 0.34.0, FLUX.1 transformer_blocks and +# single_transformer_blocks have different forward patterns. +cache_dit.enable_cache( + BlockAdapter( + pipe=pipe, # FLUX.1, etc. + transformer=pipe.transformer, + blocks=[ + pipe.transformer.transformer_blocks, + pipe.transformer.single_transformer_blocks, + ], + forward_pattern=[ + ForwardPattern.Pattern_1, + ForwardPattern.Pattern_3, + ], + ), +) +``` + +This also works if there is more than one transformer (namely `transformer` and `transformer_2`) in its structure. Refer to [Wan 2.2 MoE](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_wan_2.2.py) as an example. + +## Patch Functor + +For any pattern not included in CacheDiT, use the Patch Functor to convert the pattern into a known pattern. You need to subclass the Patch Functor and may also need to fuse the operations within the blocks for loop into block `forward`. After implementing a Patch Functor, set the `patch_functor` property in `BlockAdapter`. + +![](https://github.com/vipshop/cache-dit/raw/main/assets/patch-functor.png) + +Some Patch Functors are already provided in CacheDiT, [HiDreamPatchFunctor](https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/cache_factory/patch_functors/functor_hidream.py), [ChromaPatchFunctor](https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/cache_factory/patch_functors/functor_chroma.py), etc. + +```python +@BlockAdapterRegistry.register("HiDream") +def hidream_adapter(pipe, **kwargs) -> BlockAdapter: + from diffusers import HiDreamImageTransformer2DModel + from cache_dit.cache_factory.patch_functors import HiDreamPatchFunctor + + assert isinstance(pipe.transformer, HiDreamImageTransformer2DModel) + return BlockAdapter( + pipe=pipe, + transformer=pipe.transformer, + blocks=[ + pipe.transformer.double_stream_blocks, + pipe.transformer.single_stream_blocks, + ], + forward_pattern=[ + ForwardPattern.Pattern_0, + ForwardPattern.Pattern_3, + ], + # NOTE: Setup your custom patch functor here. + patch_functor=HiDreamPatchFunctor(), + **kwargs, + ) +``` + +Finally, you can call the `cache_dit.summary()` function on a pipeline after its completed inference to get the cache acceleration details. + +```python +stats = cache_dit.summary(pipe) +``` + +```python +⚡️Cache Steps and Residual Diffs Statistics: QwenImagePipeline + +| Cache Steps | Diffs Min | Diffs P25 | Diffs P50 | Diffs P75 | Diffs P95 | Diffs Max | +|-------------|-----------|-----------|-----------|-----------|-----------|-----------| +| 23 | 0.045 | 0.084 | 0.114 | 0.147 | 0.241 | 0.297 | +``` + +## DBCache: Dual Block Cache + +![](https://github.com/vipshop/cache-dit/raw/main/assets/dbcache-v1.png) + +DBCache (Dual Block Caching) supports different configurations of compute blocks (F8B12, etc.) to enable a balanced trade-off between performance and precision. +- Fn_compute_blocks: Specifies that DBCache uses the **first n** Transformer blocks to fit the information at time step t, enabling the calculation of a more stable L1 diff and delivering more accurate information to subsequent blocks. +- Bn_compute_blocks: Further fuses approximate information in the **last n** Transformer blocks to enhance prediction accuracy. These blocks act as an auto-scaler for approximate hidden states that use residual cache. + + +```python +import cache_dit +from diffusers import FluxPipeline + +pipe_or_adapter = FluxPipeline.from_pretrained( + "black-forest-labs/FLUX.1-dev", + torch_dtype=torch.bfloat16, +).to("cuda") + +# Default options, F8B0, 8 warmup steps, and unlimited cached +# steps for good balance between performance and precision +cache_dit.enable_cache(pipe_or_adapter) + +# Custom options, F8B8, higher precision +from cache_dit import BasicCacheConfig + +cache_dit.enable_cache( + pipe_or_adapter, + cache_config=BasicCacheConfig( + max_warmup_steps=8, # steps do not cache + max_cached_steps=-1, # -1 means no limit + Fn_compute_blocks=8, # Fn, F8, etc. + Bn_compute_blocks=8, # Bn, B8, etc. + residual_diff_threshold=0.12, + ), +) +``` +Check the [DBCache](https://github.com/vipshop/cache-dit/blob/main/docs/DBCache.md) and [User Guide](https://github.com/vipshop/cache-dit/blob/main/docs/User_Guide.md#dbcache) docs for more design details. + +## TaylorSeer Calibrator + +The [TaylorSeers](https://huggingface.co/papers/2503.06923) algorithm further improves the precision of DBCache in cases where the cached steps are large (Hybrid TaylorSeer + DBCache). At timesteps with significant intervals, the feature similarity in diffusion models decreases substantially, significantly harming the generation quality. + +TaylorSeer employs a differential method to approximate the higher-order derivatives of features and predict features in future timesteps with Taylor series expansion. The TaylorSeer implemented in CacheDiT supports both hidden states and residual cache types. F_pred can be a residual cache or a hidden-state cache. + +```python +from cache_dit import BasicCacheConfig, TaylorSeerCalibratorConfig + +cache_dit.enable_cache( + pipe_or_adapter, + # Basic DBCache w/ FnBn configurations + cache_config=BasicCacheConfig( + max_warmup_steps=8, # steps do not cache + max_cached_steps=-1, # -1 means no limit + Fn_compute_blocks=8, # Fn, F8, etc. + Bn_compute_blocks=8, # Bn, B8, etc. + residual_diff_threshold=0.12, + ), + # Then, you can use the TaylorSeer Calibrator to approximate + # the values in cached steps, taylorseer_order default is 1. + calibrator_config=TaylorSeerCalibratorConfig( + taylorseer_order=1, + ), +) +``` + +> [!TIP] +> The `Bn_compute_blocks` parameter of DBCache can be set to `0` if you use TaylorSeer as the calibrator for approximate hidden states. DBCache's `Bn_compute_blocks` also acts as a calibrator, so you can choose either `Bn_compute_blocks` > 0 or TaylorSeer. We recommend using the configuration scheme of TaylorSeer + DBCache FnB0. + +## Hybrid Cache CFG + +CacheDiT supports caching for CFG (classifier-free guidance). For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG in the forward step, please set `enable_separate_cfg` parameter to `False (default, None)`. Otherwise, set it to `True`. + +```python +from cache_dit import BasicCacheConfig + +cache_dit.enable_cache( + pipe_or_adapter, + cache_config=BasicCacheConfig( + ..., + # For example, set it as True for Wan 2.1, Qwen-Image + # and set it as False for FLUX.1, HunyuanVideo, etc. + enable_separate_cfg=True, + ), +) +``` + +## torch.compile + +CacheDiT is designed to work with torch.compile for even better performance. Call `torch.compile` after enabling the cache. + + +```python +cache_dit.enable_cache(pipe) + +# Compile the Transformer module +pipe.transformer = torch.compile(pipe.transformer) +``` + +If you're using CacheDiT with dynamic input shapes, consider increasing the `recompile_limit` of `torch._dynamo`. Otherwise, the `recompile_limit` error may be triggered, causing the module to fall back to eager mode. + +```python +torch._dynamo.config.recompile_limit = 96 # default is 8 +torch._dynamo.config.accumulated_recompile_limit = 2048 # default is 256 +``` + +Please check [perf.py](https://github.com/vipshop/cache-dit/blob/main/bench/perf.py) for more details. From ec5449f3a1378df207df481bfa1ad7ff8057a58a Mon Sep 17 00:00:00 2001 From: Lucain Date: Thu, 25 Sep 2025 18:28:54 +0200 Subject: [PATCH 15/18] Support both huggingface_hub `v0.x` and `v1.x` (#12389) * Support huggingface_hub 0.x and 1.x * httpx --- setup.py | 4 +++- src/diffusers/configuration_utils.py | 4 ++-- src/diffusers/dependency_versions_table.py | 3 ++- src/diffusers/models/modeling_flax_utils.py | 4 ++-- src/diffusers/pipelines/pipeline_loading_utils.py | 6 +++--- src/diffusers/pipelines/pipeline_utils.py | 6 +++--- src/diffusers/utils/hub_utils.py | 6 +++--- tests/models/test_modeling_common.py | 7 +++---- tests/pipelines/test_pipelines.py | 6 +++--- 9 files changed, 24 insertions(+), 22 deletions(-) diff --git a/setup.py b/setup.py index ba3ad8e2b3..372a568595 100644 --- a/setup.py +++ b/setup.py @@ -102,7 +102,8 @@ _deps = [ "filelock", "flax>=0.4.1", "hf-doc-builder>=0.3.0", - "huggingface-hub>=0.34.0", + "httpx<1.0.0", + "huggingface-hub>=0.34.0,<2.0", "requests-mock==1.10.0", "importlib_metadata", "invisible-watermark>=0.2.0", @@ -259,6 +260,7 @@ extras["dev"] = ( install_requires = [ deps["importlib_metadata"], deps["filelock"], + deps["httpx"], deps["huggingface-hub"], deps["numpy"], deps["regex"], diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 540aab0307..1c4ee33acb 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -30,11 +30,11 @@ import numpy as np from huggingface_hub import DDUFEntry, create_repo, hf_hub_download from huggingface_hub.utils import ( EntryNotFoundError, + HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError, validate_hf_hub_args, ) -from requests import HTTPError from typing_extensions import Self from . import __version__ @@ -419,7 +419,7 @@ class ConfigMixin: raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}." ) - except HTTPError as err: + except HfHubHTTPError as err: raise EnvironmentError( "There was a specific connection error when trying to load" f" {pretrained_model_name_or_path}:\n{err}" diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 79dc4c50a0..bfc4e9818b 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -9,7 +9,8 @@ deps = { "filelock": "filelock", "flax": "flax>=0.4.1", "hf-doc-builder": "hf-doc-builder>=0.3.0", - "huggingface-hub": "huggingface-hub>=0.34.0", + "httpx": "httpx<1.0.0", + "huggingface-hub": "huggingface-hub>=0.34.0,<2.0", "requests-mock": "requests-mock==1.10.0", "importlib_metadata": "importlib_metadata", "invisible-watermark": "invisible-watermark>=0.2.0", diff --git a/src/diffusers/models/modeling_flax_utils.py b/src/diffusers/models/modeling_flax_utils.py index 573828dc4b..8050afff27 100644 --- a/src/diffusers/models/modeling_flax_utils.py +++ b/src/diffusers/models/modeling_flax_utils.py @@ -26,11 +26,11 @@ from flax.traverse_util import flatten_dict, unflatten_dict from huggingface_hub import create_repo, hf_hub_download from huggingface_hub.utils import ( EntryNotFoundError, + HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError, validate_hf_hub_args, ) -from requests import HTTPError from .. import __version__, is_torch_available from ..utils import ( @@ -385,7 +385,7 @@ class FlaxModelMixin(PushToHubMixin): raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {FLAX_WEIGHTS_NAME}." ) - except HTTPError as err: + except HfHubHTTPError as err: raise EnvironmentError( f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n" f"{err}" diff --git a/src/diffusers/pipelines/pipeline_loading_utils.py b/src/diffusers/pipelines/pipeline_loading_utils.py index 388128df0e..b7a3e08105 100644 --- a/src/diffusers/pipelines/pipeline_loading_utils.py +++ b/src/diffusers/pipelines/pipeline_loading_utils.py @@ -19,12 +19,12 @@ import warnings from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union +import httpx import requests import torch from huggingface_hub import DDUFEntry, ModelCard, model_info, snapshot_download -from huggingface_hub.utils import OfflineModeIsEnabled, validate_hf_hub_args +from huggingface_hub.utils import HfHubHTTPError, OfflineModeIsEnabled, validate_hf_hub_args from packaging import version -from requests.exceptions import HTTPError from .. import __version__ from ..utils import ( @@ -1110,7 +1110,7 @@ def _download_dduf_file( if not local_files_only: try: info = model_info(pretrained_model_name, token=token, revision=revision) - except (HTTPError, OfflineModeIsEnabled, requests.ConnectionError) as e: + except (HfHubHTTPError, OfflineModeIsEnabled, requests.ConnectionError, httpx.NetworkError) as e: logger.warning(f"Couldn't connect to the Hub: {e}.\nWill try to load from local cache.") local_files_only = True model_info_call_error = e # save error to reraise it if model is not cached locally diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 01b3c56777..3f6e53099b 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -23,6 +23,7 @@ from dataclasses import dataclass from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union, get_args, get_origin +import httpx import numpy as np import PIL.Image import requests @@ -36,9 +37,8 @@ from huggingface_hub import ( read_dduf_file, snapshot_download, ) -from huggingface_hub.utils import OfflineModeIsEnabled, validate_hf_hub_args +from huggingface_hub.utils import HfHubHTTPError, OfflineModeIsEnabled, validate_hf_hub_args from packaging import version -from requests.exceptions import HTTPError from tqdm.auto import tqdm from typing_extensions import Self @@ -1616,7 +1616,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin): if not local_files_only: try: info = model_info(pretrained_model_name, token=token, revision=revision) - except (HTTPError, OfflineModeIsEnabled, requests.ConnectionError) as e: + except (HfHubHTTPError, OfflineModeIsEnabled, requests.ConnectionError, httpx.NetworkError) as e: logger.warning(f"Couldn't connect to the Hub: {e}.\nWill try to load from local cache.") local_files_only = True model_info_call_error = e # save error to reraise it if model is not cached locally diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py index fcdf49156a..b6e99452aa 100644 --- a/src/diffusers/utils/hub_utils.py +++ b/src/diffusers/utils/hub_utils.py @@ -38,13 +38,13 @@ from huggingface_hub.constants import HF_HUB_DISABLE_TELEMETRY, HF_HUB_OFFLINE from huggingface_hub.file_download import REGEX_COMMIT_HASH from huggingface_hub.utils import ( EntryNotFoundError, + HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError, is_jinja_available, validate_hf_hub_args, ) from packaging import version -from requests import HTTPError from .. import __version__ from .constants import ( @@ -316,7 +316,7 @@ def _get_model_file( raise EnvironmentError( f"{pretrained_model_name_or_path} does not appear to have a file named {weights_name}." ) from e - except HTTPError as e: + except HfHubHTTPError as e: raise EnvironmentError( f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{e}" ) from e @@ -432,7 +432,7 @@ def _get_checkpoint_shard_files( # We have already dealt with RepositoryNotFoundError and RevisionNotFoundError when getting the index, so # we don't have to catch them here. We have also dealt with EntryNotFoundError. - except HTTPError as e: + except HfHubHTTPError as e: raise EnvironmentError( f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load {pretrained_model_name_or_path}. You should try" " again after checking your internet connection." diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index 5e7be62342..3a008edfe1 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -37,9 +37,8 @@ import torch import torch.nn as nn from accelerate.utils.modeling import _get_proper_dtype, compute_module_sizes, dtype_byte_size from huggingface_hub import ModelCard, delete_repo, snapshot_download, try_to_load_from_cache -from huggingface_hub.utils import is_jinja_available +from huggingface_hub.utils import HfHubHTTPError, is_jinja_available from parameterized import parameterized -from requests.exceptions import HTTPError from diffusers.models import FluxTransformer2DModel, SD3Transformer2DModel, UNet2DConditionModel from diffusers.models.attention_processor import ( @@ -272,7 +271,7 @@ class ModelUtilsTest(unittest.TestCase): response_mock = mock.Mock() response_mock.status_code = 500 response_mock.headers = {} - response_mock.raise_for_status.side_effect = HTTPError + response_mock.raise_for_status.side_effect = HfHubHTTPError("Server down", response=mock.Mock()) response_mock.json.return_value = {} # Download this model to make sure it's in the cache. @@ -296,7 +295,7 @@ class ModelUtilsTest(unittest.TestCase): error_response = mock.Mock( status_code=500, headers={}, - raise_for_status=mock.Mock(side_effect=HTTPError), + raise_for_status=mock.Mock(side_effect=HfHubHTTPError("Server down", response=mock.Mock())), json=mock.Mock(return_value={}), ) diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 09df140f1a..3a69813612 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -33,9 +33,9 @@ import safetensors.torch import torch import torch.nn as nn from huggingface_hub import snapshot_download +from huggingface_hub.utils import HfHubHTTPError from parameterized import parameterized from PIL import Image -from requests.exceptions import HTTPError from transformers import CLIPImageProcessor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer from diffusers import ( @@ -430,7 +430,7 @@ class DownloadTests(unittest.TestCase): response_mock = mock.Mock() response_mock.status_code = 500 response_mock.headers = {} - response_mock.raise_for_status.side_effect = HTTPError + response_mock.raise_for_status.side_effect = HfHubHTTPError("Server down", response=mock.Mock()) response_mock.json.return_value = {} # Download this model to make sure it's in the cache. @@ -457,7 +457,7 @@ class DownloadTests(unittest.TestCase): response_mock = mock.Mock() response_mock.status_code = 500 response_mock.headers = {} - response_mock.raise_for_status.side_effect = HTTPError + response_mock.raise_for_status.side_effect = HfHubHTTPError("Server down", response=mock.Mock()) response_mock.json.return_value = {} # first check that with local files only the pipeline can only be used if cached From 4588bbeb4229fd307119257e273a424b370573b1 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 26 Sep 2025 18:41:17 +0530 Subject: [PATCH 16/18] [CI] disable installing transformers from main in ci for now. (#12397) * disable installing transformers from main in ci for now. * up * u[p --- .github/workflows/pr_modular_tests.yml | 5 +++-- .github/workflows/pr_tests.yml | 12 +++++++----- .github/workflows/pr_tests_gpu.yml | 13 ++++++++----- tests/pipelines/kandinsky/test_kandinsky.py | 4 +++- .../pipelines/kandinsky/test_kandinsky_combined.py | 12 +++++++++--- tests/pipelines/kandinsky/test_kandinsky_img2img.py | 4 +++- tests/pipelines/kandinsky/test_kandinsky_inpaint.py | 4 +++- 7 files changed, 36 insertions(+), 18 deletions(-) diff --git a/.github/workflows/pr_modular_tests.yml b/.github/workflows/pr_modular_tests.yml index e01345e325..c6e87e642d 100644 --- a/.github/workflows/pr_modular_tests.yml +++ b/.github/workflows/pr_modular_tests.yml @@ -110,8 +110,9 @@ jobs: run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] - pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps - pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps + # Stopping this update temporarily until the Hub RC is fully shipped and integrated. + # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps - name: Environment run: | diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index 34a344528e..ebfe9f442f 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -116,8 +116,9 @@ jobs: run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] - pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps - pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps + # Stopping this update temporarily until the Hub RC is fully shipped and integrated. + # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps - name: Environment run: | @@ -253,9 +254,10 @@ jobs: python -m uv pip install -e [quality,test] # TODO (sayakpaul, DN6): revisit `--no-deps` python -m pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps - python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps - python -m uv pip install -U tokenizers - pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps + # Stopping this update temporarily until the Hub RC is fully shipped and integrated. + # python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + # python -m uv pip install -U tokenizers + # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps - name: Environment run: | diff --git a/.github/workflows/pr_tests_gpu.yml b/.github/workflows/pr_tests_gpu.yml index 45294c89fe..1a8d5f6b81 100644 --- a/.github/workflows/pr_tests_gpu.yml +++ b/.github/workflows/pr_tests_gpu.yml @@ -132,8 +132,9 @@ jobs: run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] - pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git - pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + # Stopping this update temporarily until the Hub RC is fully shipped and integrated. + # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git + # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps - name: Environment run: | @@ -203,8 +204,9 @@ jobs: python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install peft@git+https://github.com/huggingface/peft.git - pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git - pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + # Stopping this update temporarily until the Hub RC is fully shipped and integrated. + # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git + # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps - name: Environment run: | @@ -266,7 +268,8 @@ jobs: - name: Install dependencies run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" - pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps + # Stopping this update temporarily until the Hub RC is fully shipped and integrated. + # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps python -m uv pip install -e [quality,test,training] - name: Environment diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 9fa39b1bf5..6207e71df8 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -218,7 +218,9 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase): return dummy.get_dummy_inputs(device=device, seed=seed) @pytest.mark.xfail( - condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + condition=is_transformers_version(">=", "4.56.2"), + reason="Latest transformers changes the slices", + strict=False, ) def test_kandinsky(self): device = "cpu" diff --git a/tests/pipelines/kandinsky/test_kandinsky_combined.py b/tests/pipelines/kandinsky/test_kandinsky_combined.py index ca80461d87..eba8976597 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky/test_kandinsky_combined.py @@ -76,7 +76,9 @@ class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase) return inputs @pytest.mark.xfail( - condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + condition=is_transformers_version(">=", "4.56.2"), + reason="Latest transformers changes the slices", + strict=False, ) def test_kandinsky(self): device = "cpu" @@ -187,7 +189,9 @@ class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.Te return inputs @pytest.mark.xfail( - condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + condition=is_transformers_version(">=", "4.56.2"), + reason="Latest transformers changes the slices", + strict=False, ) def test_kandinsky(self): device = "cpu" @@ -301,7 +305,9 @@ class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.Te return inputs @pytest.mark.xfail( - condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + condition=is_transformers_version(">=", "4.56.2"), + reason="Latest transformers changes the slices", + strict=False, ) def test_kandinsky(self): device = "cpu" diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 6bcd9587f2..6d1b43a24f 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -240,7 +240,9 @@ class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): return dummies.get_dummy_inputs(device=device, seed=seed) @pytest.mark.xfail( - condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + condition=is_transformers_version(">=", "4.56.2"), + reason="Latest transformers changes the slices", + strict=False, ) def test_kandinsky_img2img(self): device = "cpu" diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 6383ca71ef..e2f4aa2a4f 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -234,7 +234,9 @@ class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): return dummies.get_dummy_inputs(device=device, seed=seed) @pytest.mark.xfail( - condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + condition=is_transformers_version(">=", "4.56.2"), + reason="Latest transformers changes the slices", + strict=False, ) def test_kandinsky_inpaint(self): device = "cpu" From 9c0944581a386736bc808e68d7dfb52d8cf1790e Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 26 Sep 2025 21:50:16 +0530 Subject: [PATCH 17/18] [docs] slight edits to the attention backends docs. (#12394) * slight edits to the attention backends docs. * Update docs/source/en/optimization/attention_backends.md Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/optimization/attention_backends.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/source/en/optimization/attention_backends.md b/docs/source/en/optimization/attention_backends.md index 04c8b4ba92..e603878a63 100644 --- a/docs/source/en/optimization/attention_backends.md +++ b/docs/source/en/optimization/attention_backends.md @@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. --> # Attention backends -> [!TIP] +> [!NOTE] > The attention dispatcher is an experimental feature. Please open an issue if you have any feedback or encounter any problems. Diffusers provides several optimized attention algorithms that are more memory and computationally efficient through it's *attention dispatcher*. The dispatcher acts as a router for managing and switching between different attention implementations and provides a unified interface for interacting with them. @@ -33,7 +33,7 @@ The [`~ModelMixin.set_attention_backend`] method iterates through all the module The example below demonstrates how to enable the `_flash_3_hub` implementation for FlashAttention-3 from the [kernel](https://github.com/huggingface/kernels) library, which allows you to instantly use optimized compute kernels from the Hub without requiring any setup. -> [!TIP] +> [!NOTE] > FlashAttention-3 is not supported for non-Hopper architectures, in which case, use FlashAttention with `set_attention_backend("flash")`. ```py @@ -78,10 +78,16 @@ with attention_backend("_flash_3_hub"): image = pipeline(prompt).images[0] ``` +> [!TIP] +> Most attention backends support `torch.compile` without graph breaks and can be used to further speed up inference. + ## Available backends Refer to the table below for a complete list of available attention backends and their variants. +
+Expand + | Backend Name | Family | Description | |--------------|--------|-------------| | `native` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | Default backend using PyTorch's scaled_dot_product_attention | @@ -104,3 +110,5 @@ Refer to the table below for a complete list of available attention backends and | `_sage_qk_int8_pv_fp16_cuda` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP16 PV (CUDA) | | `_sage_qk_int8_pv_fp16_triton` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP16 PV (Triton) | | `xformers` | [xFormers](https://github.com/facebookresearch/xformers) | Memory-efficient attention | + +
\ No newline at end of file From 041501aea92919c9c7f36e189fc9cf7d865ebb96 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 26 Sep 2025 22:38:43 +0530 Subject: [PATCH 18/18] [docs] remove docstrings from repeated methods in `lora_pipeline.py` (#12393) * start unbloating docstrings (save_lora_weights). * load_lora_weights() * lora_state_dict * fuse_lora * unfuse_lora * load_lora_into_transformer --- src/diffusers/loaders/lora_pipeline.py | 2192 +----------------------- 1 file changed, 86 insertions(+), 2106 deletions(-) diff --git a/src/diffusers/loaders/lora_pipeline.py b/src/diffusers/loaders/lora_pipeline.py index 8060b519f1..65bdae6920 100644 --- a/src/diffusers/loaders/lora_pipeline.py +++ b/src/diffusers/loaders/lora_pipeline.py @@ -621,33 +621,7 @@ class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and - `self.text_encoder`. - - All kwargs are forwarded to `self.lora_state_dict`. - - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is - loaded. - - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details on how the state dict is - loaded into `self.unet`. - - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_text_encoder`] for more details on how the state - dict is loaded into `self.text_encoder`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -967,35 +941,7 @@ class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin): text_encoder_2_lora_adapter_metadata=None, ): r""" - Save the LoRA parameters corresponding to the UNet and text encoder. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `unet`. - text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text - encoder LoRA state dict because it comes from 🤗 Transformers. - text_encoder_2_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `text_encoder_2`. Must explicitly pass the text - encoder LoRA state dict because it comes from 🤗 Transformers. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - unet_lora_adapter_metadata: - LoRA adapter metadata associated with the unet to be serialized with the state dict. - text_encoder_lora_adapter_metadata: - LoRA adapter metadata associated with the text encoder to be serialized with the state dict. - text_encoder_2_lora_adapter_metadata: - LoRA adapter metadata associated with the second text encoder to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -1036,35 +982,7 @@ class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -1076,21 +994,7 @@ class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin): def unfuse_lora(self, components: List[str] = ["unet", "text_encoder", "text_encoder_2"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_unet (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. - unfuse_text_encoder (`bool`, defaults to `True`): - Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the - LoRA parameters then it won't have any effect. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -1116,51 +1020,7 @@ class SD3LoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -1214,30 +1074,7 @@ class SD3LoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and - `self.text_encoder`. - - All kwargs are forwarded to `self.lora_state_dict`. - - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is - loaded. - - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -1306,26 +1143,7 @@ class SD3LoraLoaderMixin(LoraBaseMixin): metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`SD3Transformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -1420,35 +1238,7 @@ class SD3LoraLoaderMixin(LoraBaseMixin): text_encoder_2_lora_adapter_metadata=None, ): r""" - Save the LoRA parameters corresponding to the UNet and text encoder. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text - encoder LoRA state dict because it comes from 🤗 Transformers. - text_encoder_2_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `text_encoder_2`. Must explicitly pass the text - encoder LoRA state dict because it comes from 🤗 Transformers. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. - text_encoder_lora_adapter_metadata: - LoRA adapter metadata associated with the text encoder to be serialized with the state dict. - text_encoder_2_lora_adapter_metadata: - LoRA adapter metadata associated with the second text encoder to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -1490,35 +1280,7 @@ class SD3LoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -1531,21 +1293,7 @@ class SD3LoraLoaderMixin(LoraBaseMixin): # Copied from diffusers.loaders.lora_pipeline.StableDiffusionXLLoraLoaderMixin.unfuse_lora with unet->transformer def unfuse_lora(self, components: List[str] = ["transformer", "text_encoder", "text_encoder_2"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. - unfuse_text_encoder (`bool`, defaults to `True`): - Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the - LoRA parameters then it won't have any effect. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -1567,51 +1315,7 @@ class AuraFlowLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -1666,25 +1370,7 @@ class AuraFlowLoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -1730,26 +1416,7 @@ class AuraFlowLoraLoaderMixin(LoraBaseMixin): metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`AuraFlowTransformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -1781,25 +1448,7 @@ class AuraFlowLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -1831,35 +1480,7 @@ class AuraFlowLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -1872,18 +1493,7 @@ class AuraFlowLoraLoaderMixin(LoraBaseMixin): # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer", "text_encoder"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -1910,50 +1520,7 @@ class FluxLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -2207,30 +1774,7 @@ class FluxLoraLoaderMixin(LoraBaseMixin): hotswap: bool = False, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - network_alphas (`Dict[str, float]`): - The value of the network alpha used for stable learning and preventing underflow. This value has the - same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this - link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning). - transformer (`FluxTransformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and not is_peft_version(">=", "0.13.1"): raise ValueError( @@ -2435,35 +1979,7 @@ class FluxLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ transformer = getattr(self, self.transformer_name) if not hasattr(self, "transformer") else self.transformer @@ -2806,30 +2322,7 @@ class AmusedLoraLoaderMixin(StableDiffusionLoraLoaderMixin): hotswap: bool = False, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - network_alphas (`Dict[str, float]`): - The value of the network alpha used for stable learning and preventing underflow. This value has the - same meaning as the `--network_alpha` option in the kohya-ss trainer script. Refer to [this - link](https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning). - transformer (`UVit2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and not is_peft_version(">=", "0.13.1"): raise ValueError( @@ -2979,51 +2472,7 @@ class CogVideoXLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -3077,25 +2526,7 @@ class CogVideoXLoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -3141,26 +2572,7 @@ class CogVideoXLoraLoaderMixin(LoraBaseMixin): metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`CogVideoXTransformer3DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -3180,7 +2592,6 @@ class CogVideoXLoraLoaderMixin(LoraBaseMixin): ) @classmethod - # Adapted from diffusers.loaders.lora_pipeline.StableDiffusionLoraLoaderMixin.save_lora_weights without support for text encoder def save_lora_weights( cls, save_directory: Union[str, os.PathLike], @@ -3192,25 +2603,7 @@ class CogVideoXLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -3241,35 +2634,7 @@ class CogVideoXLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -3281,18 +2646,7 @@ class CogVideoXLoraLoaderMixin(LoraBaseMixin): def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -3314,51 +2668,7 @@ class Mochi1LoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -3413,25 +2723,7 @@ class Mochi1LoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -3477,26 +2769,7 @@ class Mochi1LoraLoaderMixin(LoraBaseMixin): metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`MochiTransformer3DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -3528,25 +2801,7 @@ class Mochi1LoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -3578,35 +2833,7 @@ class Mochi1LoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -3619,18 +2846,7 @@ class Mochi1LoraLoaderMixin(LoraBaseMixin): # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -3651,50 +2867,7 @@ class LTXVideoLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -3753,25 +2926,7 @@ class LTXVideoLoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -3817,26 +2972,7 @@ class LTXVideoLoraLoaderMixin(LoraBaseMixin): metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`LTXVideoTransformer3DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -3868,25 +3004,7 @@ class LTXVideoLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -3918,35 +3036,7 @@ class LTXVideoLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -3959,18 +3049,7 @@ class LTXVideoLoraLoaderMixin(LoraBaseMixin): # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -3992,51 +3071,7 @@ class SanaLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -4091,25 +3126,7 @@ class SanaLoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -4155,26 +3172,7 @@ class SanaLoraLoaderMixin(LoraBaseMixin): metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`SanaTransformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -4206,25 +3204,7 @@ class SanaLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -4256,35 +3236,7 @@ class SanaLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -4297,18 +3249,7 @@ class SanaLoraLoaderMixin(LoraBaseMixin): # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -4329,50 +3270,7 @@ class HunyuanVideoLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading original format HunyuanVideo LoRA checkpoints. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -4431,25 +3329,7 @@ class HunyuanVideoLoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -4495,26 +3375,7 @@ class HunyuanVideoLoraLoaderMixin(LoraBaseMixin): metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`HunyuanVideoTransformer3DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -4546,25 +3407,7 @@ class HunyuanVideoLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -4596,35 +3439,7 @@ class HunyuanVideoLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -4637,18 +3452,7 @@ class HunyuanVideoLoraLoaderMixin(LoraBaseMixin): # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -4669,50 +3473,7 @@ class Lumina2LoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -4772,25 +3533,7 @@ class Lumina2LoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -4836,26 +3579,7 @@ class Lumina2LoraLoaderMixin(LoraBaseMixin): metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`Lumina2Transformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -4887,25 +3611,7 @@ class Lumina2LoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -4937,35 +3643,7 @@ class Lumina2LoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -4978,18 +3656,7 @@ class Lumina2LoraLoaderMixin(LoraBaseMixin): # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -5010,50 +3677,7 @@ class WanLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -5159,25 +3783,7 @@ class WanLoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -5247,26 +3853,7 @@ class WanLoraLoaderMixin(LoraBaseMixin): metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`WanTransformer3DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -5298,25 +3885,7 @@ class WanLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -5348,35 +3917,7 @@ class WanLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -5389,18 +3930,7 @@ class WanLoraLoaderMixin(LoraBaseMixin): # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -5422,50 +3952,7 @@ class SkyReelsV2LoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -5573,25 +4060,7 @@ class SkyReelsV2LoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -5661,26 +4130,7 @@ class SkyReelsV2LoraLoaderMixin(LoraBaseMixin): metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`SkyReelsV2Transformer3DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -5712,25 +4162,7 @@ class SkyReelsV2LoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -5762,35 +4194,7 @@ class SkyReelsV2LoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -5803,18 +4207,7 @@ class SkyReelsV2LoraLoaderMixin(LoraBaseMixin): # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -5836,51 +4229,7 @@ class CogView4LoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -5935,25 +4284,7 @@ class CogView4LoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -5999,26 +4330,7 @@ class CogView4LoraLoaderMixin(LoraBaseMixin): metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`CogView4Transformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -6050,25 +4362,7 @@ class CogView4LoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -6100,35 +4394,7 @@ class CogView4LoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -6141,18 +4407,7 @@ class CogView4LoraLoaderMixin(LoraBaseMixin): # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -6173,50 +4428,7 @@ class HiDreamImageLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -6275,25 +4487,7 @@ class HiDreamImageLoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -6339,26 +4533,7 @@ class HiDreamImageLoraLoaderMixin(LoraBaseMixin): metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`HiDreamImageTransformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -6390,25 +4565,7 @@ class HiDreamImageLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -6440,35 +4597,7 @@ class HiDreamImageLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -6481,18 +4610,7 @@ class HiDreamImageLoraLoaderMixin(LoraBaseMixin): # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs) @@ -6513,51 +4631,7 @@ class QwenImageLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Return state dict for lora weights and the network alphas. - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`ModelMixin.save_pretrained`]. - - A [torch state - dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_lora_metadata (`bool`, *optional*, defaults to False): - When enabled, additionally return the LoRA adapter metadata, typically found in the state dict. - + See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details. """ # Load the main state dict first which has the LoRA layers for either of # transformer and text encoder or both. @@ -6618,25 +4692,7 @@ class QwenImageLoraLoaderMixin(LoraBaseMixin): **kwargs, ): """ - Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.transformer` and - `self.text_encoder`. All kwargs are forwarded to `self.lora_state_dict`. See - [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_transformer`] for more details on how the state - dict is loaded into `self.transformer`. - - Parameters: - pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - kwargs (`dict`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.lora_state_dict`]. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for more details. """ if not USE_PEFT_BACKEND: raise ValueError("PEFT backend is required for this method.") @@ -6682,26 +4738,7 @@ class QwenImageLoraLoaderMixin(LoraBaseMixin): metadata=None, ): """ - This will load the LoRA layers specified in `state_dict` into `transformer`. - - Parameters: - state_dict (`dict`): - A standard state dict containing the lora layer parameters. The keys can either be indexed directly - into the unet or prefixed with an additional `unet` which can be used to distinguish between text - encoder lora layers. - transformer (`QwenImageTransformer2DModel`): - The Transformer model to load the LoRA layers into. - adapter_name (`str`, *optional*): - Adapter name to be used for referencing the loaded adapter model. If not specified, it will use - `default_{i}` where i is the total number of adapters being loaded. - low_cpu_mem_usage (`bool`, *optional*): - Speed up model loading by only loading the pretrained LoRA weights and not initializing the random - weights. - hotswap (`bool`, *optional*): - See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`]. - metadata (`dict`): - Optional LoRA adapter metadata. When supplied, the `LoraConfig` arguments of `peft` won't be derived - from the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_into_unet`] for more details. """ if low_cpu_mem_usage and is_peft_version("<", "0.13.0"): raise ValueError( @@ -6733,25 +4770,7 @@ class QwenImageLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: Optional[dict] = None, ): r""" - Save the LoRA parameters corresponding to the transformer. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save LoRA parameters to. Will be created if it doesn't exist. - transformer_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): - State dict of the LoRA layers corresponding to the `transformer`. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - transformer_lora_adapter_metadata: - LoRA adapter metadata associated with the transformer to be serialized with the state dict. + See [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for more information. """ lora_layers = {} lora_metadata = {} @@ -6783,35 +4802,7 @@ class QwenImageLoraLoaderMixin(LoraBaseMixin): **kwargs, ): r""" - Fuses the LoRA parameters into the original parameters of the corresponding blocks. - - - - This is an experimental API. - - - - Args: - components: (`List[str]`): List of LoRA-injectable components to fuse the LoRAs into. - lora_scale (`float`, defaults to 1.0): - Controls how much to influence the outputs with the LoRA parameters. - safe_fusing (`bool`, defaults to `False`): - Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. - adapter_names (`List[str]`, *optional*): - Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. - - Example: - - ```py - from diffusers import DiffusionPipeline - import torch - - pipeline = DiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 - ).to("cuda") - pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") - pipeline.fuse_lora(lora_scale=0.7) - ``` + See [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] for more details. """ super().fuse_lora( components=components, @@ -6824,18 +4815,7 @@ class QwenImageLoraLoaderMixin(LoraBaseMixin): # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.unfuse_lora def unfuse_lora(self, components: List[str] = ["transformer"], **kwargs): r""" - Reverses the effect of - [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraBaseMixin.fuse_lora). - - - - This is an experimental API. - - - - Args: - components (`List[str]`): List of LoRA-injectable components to unfuse LoRA from. - unfuse_transformer (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + See [`~loaders.StableDiffusionLoraLoaderMixin.unfuse_lora`] for more details. """ super().unfuse_lora(components=components, **kwargs)