diff --git a/docs/source/en/api/pipelines/qwenimage.md b/docs/source/en/api/pipelines/qwenimage.md index 2dec47309c..4c999bca35 100644 --- a/docs/source/en/api/pipelines/qwenimage.md +++ b/docs/source/en/api/pipelines/qwenimage.md @@ -26,6 +26,7 @@ Qwen-Image comes in the following variants: |:----------:|:--------:| | Qwen-Image | [`Qwen/Qwen-Image`](https://huggingface.co/Qwen/Qwen-Image) | | Qwen-Image-Edit | [`Qwen/Qwen-Image-Edit`](https://huggingface.co/Qwen/Qwen-Image-Edit) | +| Qwen-Image-Edit Plus | [`Qwen/Qwen-Image-Edit-2509`](https://huggingface.co/Qwen/Qwen-Image-Edit-2509) | @@ -96,6 +97,29 @@ The `guidance_scale` parameter in the pipeline is there to support future guidan +## Multi-image reference with QwenImageEditPlusPipeline + +With [`QwenImageEditPlusPipeline`], one can provide multiple images as input reference. + +```python +import torch +from PIL import Image +from diffusers import QwenImageEditPlusPipeline +from diffusers.utils import load_image + +pipe = QwenImageEditPlusPipeline.from_pretrained( + "Qwen/Qwen-Image-Edit-2509", torch_dtype=torch.bfloat16 +).to("cuda") + +image_1 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/grumpy.jpg") +image_2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peng.png") +image = pipe( + image=[image_1, image_2], + prompt='put the penguin and the cat at a game show called "Qwen Edit Plus Games"', + num_inference_steps=50 +).images[0] +``` + ## QwenImagePipeline [[autodoc]] QwenImagePipeline @@ -126,7 +150,15 @@ The `guidance_scale` parameter in the pipeline is there to support future guidan - all - __call__ -## QwenImaggeControlNetPipeline +## QwenImageControlNetPipeline + +[[autodoc]] QwenImageControlNetPipeline + - all + - __call__ + +## QwenImageEditPlusPipeline + +[[autodoc]] QwenImageEditPlusPipeline - all - __call__ diff --git a/src/diffusers/loaders/lora_base.py b/src/diffusers/loaders/lora_base.py 
index d18c82df4f..0ee32f820b 100644 --- a/src/diffusers/loaders/lora_base.py +++ b/src/diffusers/loaders/lora_base.py @@ -1064,6 +1064,41 @@ class LoraBaseMixin: save_function(state_dict, save_path) logger.info(f"Model weights saved in {save_path}") + @classmethod + def _save_lora_weights( + cls, + save_directory: Union[str, os.PathLike], + lora_layers: Dict[str, Dict[str, Union[torch.nn.Module, torch.Tensor]]], + lora_metadata: Dict[str, Optional[dict]], + is_main_process: bool = True, + weight_name: str = None, + save_function: Callable = None, + safe_serialization: bool = True, + ): + """ + Helper method to pack and save LoRA weights and metadata. This method centralizes the saving logic for all + pipeline types. + """ + state_dict = {} + final_lora_adapter_metadata = {} + + for prefix, layers in lora_layers.items(): + state_dict.update(cls.pack_weights(layers, prefix)) + + for prefix, metadata in lora_metadata.items(): + if metadata: + final_lora_adapter_metadata.update(_pack_dict_with_prefix(metadata, prefix)) + + cls.write_lora_layers( + state_dict=state_dict, + save_directory=save_directory, + is_main_process=is_main_process, + weight_name=weight_name, + save_function=save_function, + safe_serialization=safe_serialization, + lora_adapter_metadata=final_lora_adapter_metadata if final_lora_adapter_metadata else None, + ) + @classmethod def _optionally_disable_offloading(cls, _pipeline): return _func_optionally_disable_offloading(_pipeline=_pipeline) diff --git a/src/diffusers/loaders/lora_pipeline.py b/src/diffusers/loaders/lora_pipeline.py index 7e89066f1f..8060b519f1 100644 --- a/src/diffusers/loaders/lora_pipeline.py +++ b/src/diffusers/loaders/lora_pipeline.py @@ -510,35 +510,28 @@ class StableDiffusionLoraLoaderMixin(LoraBaseMixin): text_encoder_lora_adapter_metadata: LoRA adapter metadata associated with the text encoder to be serialized with the state dict. 
""" - state_dict = {} - lora_adapter_metadata = {} - - if not (unet_lora_layers or text_encoder_lora_layers): - raise ValueError("You must pass at least one of `unet_lora_layers` and `text_encoder_lora_layers`.") + lora_layers = {} + lora_metadata = {} if unet_lora_layers: - state_dict.update(cls.pack_weights(unet_lora_layers, cls.unet_name)) + lora_layers[cls.unet_name] = unet_lora_layers + lora_metadata[cls.unet_name] = unet_lora_adapter_metadata if text_encoder_lora_layers: - state_dict.update(cls.pack_weights(text_encoder_lora_layers, cls.text_encoder_name)) + lora_layers[cls.text_encoder_name] = text_encoder_lora_layers + lora_metadata[cls.text_encoder_name] = text_encoder_lora_adapter_metadata - if unet_lora_adapter_metadata: - lora_adapter_metadata.update(_pack_dict_with_prefix(unet_lora_adapter_metadata, cls.unet_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `unet_lora_layers` or `text_encoder_lora_layers`.") - if text_encoder_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(text_encoder_lora_adapter_metadata, cls.text_encoder_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) def fuse_lora( @@ -1004,44 +997,34 @@ class StableDiffusionXLLoraLoaderMixin(LoraBaseMixin): text_encoder_2_lora_adapter_metadata: LoRA adapter metadata associated with the second text encoder to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} - - if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): - raise ValueError( - "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers`, `text_encoder_2_lora_layers`." 
- ) + lora_layers = {} + lora_metadata = {} if unet_lora_layers: - state_dict.update(cls.pack_weights(unet_lora_layers, cls.unet_name)) + lora_layers[cls.unet_name] = unet_lora_layers + lora_metadata[cls.unet_name] = unet_lora_adapter_metadata if text_encoder_lora_layers: - state_dict.update(cls.pack_weights(text_encoder_lora_layers, "text_encoder")) + lora_layers["text_encoder"] = text_encoder_lora_layers + lora_metadata["text_encoder"] = text_encoder_lora_adapter_metadata if text_encoder_2_lora_layers: - state_dict.update(cls.pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) + lora_layers["text_encoder_2"] = text_encoder_2_lora_layers + lora_metadata["text_encoder_2"] = text_encoder_2_lora_adapter_metadata - if unet_lora_adapter_metadata is not None: - lora_adapter_metadata.update(_pack_dict_with_prefix(unet_lora_adapter_metadata, cls.unet_name)) - - if text_encoder_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(text_encoder_lora_adapter_metadata, cls.text_encoder_name) + if not lora_layers: + raise ValueError( + "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers`, or `text_encoder_2_lora_layers`." ) - if text_encoder_2_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(text_encoder_2_lora_adapter_metadata, "text_encoder_2") - ) - - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) def fuse_lora( @@ -1467,46 +1450,34 @@ class SD3LoraLoaderMixin(LoraBaseMixin): text_encoder_2_lora_adapter_metadata: LoRA adapter metadata associated with the second text encoder to be serialized with the state dict. 
""" - state_dict = {} - lora_adapter_metadata = {} - - if not (transformer_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): - raise ValueError( - "You must pass at least one of `transformer_lora_layers`, `text_encoder_lora_layers`, `text_encoder_2_lora_layers`." - ) + lora_layers = {} + lora_metadata = {} if transformer_lora_layers: - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata if text_encoder_lora_layers: - state_dict.update(cls.pack_weights(text_encoder_lora_layers, "text_encoder")) + lora_layers["text_encoder"] = text_encoder_lora_layers + lora_metadata["text_encoder"] = text_encoder_lora_adapter_metadata if text_encoder_2_lora_layers: - state_dict.update(cls.pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) + lora_layers["text_encoder_2"] = text_encoder_2_lora_layers + lora_metadata["text_encoder_2"] = text_encoder_2_lora_adapter_metadata - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) + if not lora_layers: + raise ValueError( + "You must pass at least one of `transformer_lora_layers`, `text_encoder_lora_layers`, or `text_encoder_2_lora_layers`." 
) - if text_encoder_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(text_encoder_lora_adapter_metadata, cls.text_encoder_name) - ) - - if text_encoder_2_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(text_encoder_2_lora_adapter_metadata, "text_encoder_2") - ) - - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.StableDiffusionXLLoraLoaderMixin.fuse_lora with unet->transformer @@ -1830,28 +1801,24 @@ class AuraFlowLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, 
safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.fuse_lora @@ -2435,37 +2402,28 @@ class FluxLoraLoaderMixin(LoraBaseMixin): text_encoder_lora_adapter_metadata: LoRA adapter metadata associated with the text encoder to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} - - if not (transformer_lora_layers or text_encoder_lora_layers): - raise ValueError("You must pass at least one of `transformer_lora_layers` and `text_encoder_lora_layers`.") + lora_layers = {} + lora_metadata = {} if transformer_lora_layers: - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata if text_encoder_lora_layers: - state_dict.update(cls.pack_weights(text_encoder_lora_layers, cls.text_encoder_name)) + lora_layers[cls.text_encoder_name] = text_encoder_lora_layers + lora_metadata[cls.text_encoder_name] = text_encoder_lora_adapter_metadata - if transformer_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if text_encoder_lora_adapter_metadata: - lora_adapter_metadata.update( - _pack_dict_with_prefix(text_encoder_lora_adapter_metadata, cls.text_encoder_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) def fuse_lora( @@ -3254,28 +3212,24 @@ class 
CogVideoXLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. """ - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) def fuse_lora( @@ -3594,28 +3548,24 @@ class Mochi1LoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. 
""" - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -3938,28 +3888,24 @@ class LTXVideoLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. 
""" - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -4280,28 +4226,24 @@ class SanaLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. 
""" - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -4624,28 +4566,24 @@ class HunyuanVideoLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. 
""" - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -4969,28 +4907,24 @@ class Lumina2LoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. 
""" - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.fuse_lora @@ -5384,28 +5318,24 @@ class WanLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. 
""" - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -5802,28 +5732,24 @@ class SkyReelsV2LoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. 
""" - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -6144,28 +6070,24 @@ class CogView4LoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. 
""" - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora @@ -6488,28 +6410,24 @@ class HiDreamImageLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. 
""" - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.SanaLoraLoaderMixin.fuse_lora @@ -6835,28 +6753,24 @@ class QwenImageLoraLoaderMixin(LoraBaseMixin): transformer_lora_adapter_metadata: LoRA adapter metadata associated with the transformer to be serialized with the state dict. 
""" - state_dict = {} - lora_adapter_metadata = {} + lora_layers = {} + lora_metadata = {} - if not transformer_lora_layers: - raise ValueError("You must pass `transformer_lora_layers`.") + if transformer_lora_layers: + lora_layers[cls.transformer_name] = transformer_lora_layers + lora_metadata[cls.transformer_name] = transformer_lora_adapter_metadata - state_dict.update(cls.pack_weights(transformer_lora_layers, cls.transformer_name)) + if not lora_layers: + raise ValueError("You must pass at least one of `transformer_lora_layers` or `text_encoder_lora_layers`.") - if transformer_lora_adapter_metadata is not None: - lora_adapter_metadata.update( - _pack_dict_with_prefix(transformer_lora_adapter_metadata, cls.transformer_name) - ) - - # Save the model - cls.write_lora_layers( - state_dict=state_dict, + cls._save_lora_weights( save_directory=save_directory, + lora_layers=lora_layers, + lora_metadata=lora_metadata, is_main_process=is_main_process, weight_name=weight_name, save_function=save_function, safe_serialization=safe_serialization, - lora_adapter_metadata=lora_adapter_metadata, ) # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.fuse_lora diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index c99133f257..3f59c8da8e 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -241,7 +241,7 @@ class AttentionModuleMixin: op_fw, op_bw = attention_op dtype, *_ = op_fw.SUPPORTED_DTYPES q = torch.randn((1, 2, 40), device="cuda", dtype=dtype) - _ = xops.memory_efficient_attention(q, q, q) + _ = xops.ops.memory_efficient_attention(q, q, q) except Exception as e: raise e diff --git a/src/diffusers/models/autoencoders/autoencoder_dc.py b/src/diffusers/models/autoencoders/autoencoder_dc.py index d3f31de854..783f22e97d 100644 --- a/src/diffusers/models/autoencoders/autoencoder_dc.py +++ b/src/diffusers/models/autoencoders/autoencoder_dc.py @@ -617,7 +617,7 @@ class AutoencoderDC(ModelMixin, 
ConfigMixin, FromOriginalModelMixin): returned. """ if self.use_slicing and z.size(0) > 1: - decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)] + decoded_slices = [self._decode(z_slice) for z_slice in z.split(1)] decoded = torch.cat(decoded_slices) else: decoded = self._decode(z) diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma.py b/src/diffusers/pipelines/chroma/pipeline_chroma.py index f3ed700bc4..19ea7729c9 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma.py @@ -688,11 +688,11 @@ class ChromaPipeline( their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. guidance_scale (`float`, *optional*, defaults to 3.5): - Embedded guiddance scale is enabled by setting `guidance_scale` > 1. Higher `guidance_scale` encourages - a model to generate images more aligned with `prompt` at the expense of lower image quality. - - Guidance-distilled models approximates true classifer-free guidance for `guidance_scale` > 1. Refer to - the [paper](https://huggingface.co/papers/2210.03142) to learn more. + Guidance scale as defined in [Classifier-Free Diffusion + Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. + of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting + `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to + the text `prompt`, usually at the expense of lower image quality. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. 
generator (`torch.Generator` or `List[torch.Generator]`, *optional*): diff --git a/src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py b/src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py index 26f13fe06c..9afd4b9e15 100644 --- a/src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py +++ b/src/diffusers/pipelines/chroma/pipeline_chroma_img2img.py @@ -749,12 +749,12 @@ class ChromaImg2ImgPipeline( Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed will be used. - guidance_scale (`float`, *optional*, defaults to 5.0): - Embedded guiddance scale is enabled by setting `guidance_scale` > 1. Higher `guidance_scale` encourages - a model to generate images more aligned with `prompt` at the expense of lower image quality. - - Guidance-distilled models approximates true classifer-free guidance for `guidance_scale` > 1. Refer to - the [paper](https://huggingface.co/papers/2210.03142) to learn more. + guidance_scale (`float`, *optional*, defaults to 3.5): + Guidance scale as defined in [Classifier-Free Diffusion + Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. + of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting + `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to + the text `prompt`, usually at the expense of lower image quality. strength (`float, *optional*, defaults to 0.9): Conceptually, indicates how much to transform the reference image. Must be between 0 and 1. image will be used as a starting point, adding more noise to it the larger the strength. 
The number of denoising diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 911c6e49ae..9fa39b1bf5 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -18,11 +18,13 @@ import random import unittest import numpy as np +import pytest import torch from transformers import XLMRobertaTokenizerFast from diffusers import DDIMScheduler, KandinskyPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP +from diffusers.utils import is_transformers_version from ...testing_utils import ( backend_empty_cache, @@ -215,6 +217,9 @@ class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase): dummy = Dummies() return dummy.get_dummy_inputs(device=device, seed=seed) + @pytest.mark.xfail( + condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + ) def test_kandinsky(self): device = "cpu" diff --git a/tests/pipelines/kandinsky/test_kandinsky_combined.py b/tests/pipelines/kandinsky/test_kandinsky_combined.py index d744d10821..ca80461d87 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_combined.py +++ b/tests/pipelines/kandinsky/test_kandinsky_combined.py @@ -16,8 +16,10 @@ import unittest import numpy as np +import pytest from diffusers import KandinskyCombinedPipeline, KandinskyImg2ImgCombinedPipeline, KandinskyInpaintCombinedPipeline +from diffusers.utils import is_transformers_version from ...testing_utils import enable_full_determinism, require_torch_accelerator, torch_device from ..test_pipelines_common import PipelineTesterMixin @@ -73,6 +75,9 @@ class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase) ) return inputs + @pytest.mark.xfail( + condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + ) def 
test_kandinsky(self): device = "cpu" @@ -181,6 +186,9 @@ class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.Te inputs.pop("negative_image_embeds") return inputs + @pytest.mark.xfail( + condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + ) def test_kandinsky(self): device = "cpu" @@ -292,6 +300,9 @@ class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.Te inputs.pop("negative_image_embeds") return inputs + @pytest.mark.xfail( + condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + ) def test_kandinsky(self): device = "cpu" diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 4074c8db22..6bcd9587f2 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -18,6 +18,7 @@ import random import unittest import numpy as np +import pytest import torch from PIL import Image from transformers import XLMRobertaTokenizerFast @@ -31,6 +32,7 @@ from diffusers import ( VQModel, ) from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP +from diffusers.utils import is_transformers_version from ...testing_utils import ( backend_empty_cache, @@ -237,6 +239,9 @@ class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): dummies = Dummies() return dummies.get_dummy_inputs(device=device, seed=seed) + @pytest.mark.xfail( + condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + ) def test_kandinsky_img2img(self): device = "cpu" diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index b789a63cdd..6383ca71ef 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ 
b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -18,12 +18,14 @@ import random import unittest import numpy as np +import pytest import torch from PIL import Image from transformers import XLMRobertaTokenizerFast from diffusers import DDIMScheduler, KandinskyInpaintPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP +from diffusers.utils import is_transformers_version from ...testing_utils import ( backend_empty_cache, @@ -231,6 +233,9 @@ class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): dummies = Dummies() return dummies.get_dummy_inputs(device=device, seed=seed) + @pytest.mark.xfail( + condition=is_transformers_version(">=", "4.56.2"), reason="Latest transformers changes the slices", strict=True + ) def test_kandinsky_inpaint(self): device = "cpu" diff --git a/tests/pipelines/qwenimage/test_qwenimage_edit_plus.py b/tests/pipelines/qwenimage/test_qwenimage_edit_plus.py new file mode 100644 index 0000000000..6faf347282 --- /dev/null +++ b/tests/pipelines/qwenimage/test_qwenimage_edit_plus.py @@ -0,0 +1,253 @@ +# Copyright 2025 The HuggingFace Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import unittest

import numpy as np
import pytest
import torch
from PIL import Image
from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor

from diffusers import (
    AutoencoderKLQwenImage,
    FlowMatchEulerDiscreteScheduler,
    QwenImageEditPlusPipeline,
    QwenImageTransformer2DModel,
)

from ...testing_utils import enable_full_determinism, torch_device
from ..pipeline_params import TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, to_np


enable_full_determinism()


class QwenImageEditPlusPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    """Fast tests for `QwenImageEditPlusPipeline` built from tiny, randomly initialised components."""

    pipeline_class = QwenImageEditPlusPipeline
    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
    batch_params = frozenset(["prompt", "image"])
    image_params = frozenset(["image"])
    image_latents_params = frozenset(["latents"])
    required_optional_params = frozenset(
        [
            "num_inference_steps",
            "generator",
            "latents",
            "return_dict",
            "callback_on_step_end",
            "callback_on_step_end_tensor_inputs",
        ]
    )
    supports_dduf = False
    test_xformers_attention = False
    test_layerwise_casting = True
    test_group_offloading = True

    def get_dummy_components(self):
        """Build the pipeline components (transformer, VAE, scheduler, text encoder, tokenizer, processor).

        `torch.manual_seed(0)` is called before each component is constructed, so the order of the
        statements below is significant and must not be rearranged.
        """
        tiny_ckpt_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration"

        torch.manual_seed(0)
        transformer = QwenImageTransformer2DModel(
            patch_size=2,
            in_channels=16,
            out_channels=4,
            num_layers=2,
            attention_head_dim=16,
            num_attention_heads=3,
            joint_attention_dim=16,
            guidance_embeds=False,
            axes_dims_rope=(8, 4, 4),
        )

        torch.manual_seed(0)
        z_dim = 4
        vae = AutoencoderKLQwenImage(
            base_dim=z_dim * 6,
            z_dim=z_dim,
            dim_mult=[1, 2, 4],
            num_res_blocks=1,
            temperal_downsample=[False, True],
            latents_mean=[0.0] * z_dim,
            latents_std=[1.0] * z_dim,
        )

        torch.manual_seed(0)
        scheduler = FlowMatchEulerDiscreteScheduler()

        torch.manual_seed(0)
        config = Qwen2_5_VLConfig(
            text_config={
                "hidden_size": 16,
                "intermediate_size": 16,
                "num_hidden_layers": 2,
                "num_attention_heads": 2,
                "num_key_value_heads": 2,
                "rope_scaling": {
                    "mrope_section": [1, 1, 2],
                    "rope_type": "default",
                    "type": "default",
                },
                "rope_theta": 1000000.0,
            },
            vision_config={
                "depth": 2,
                "hidden_size": 16,
                "intermediate_size": 16,
                "num_heads": 2,
                "out_hidden_size": 16,
            },
            hidden_size=16,
            vocab_size=152064,
            vision_end_token_id=151653,
            vision_start_token_id=151652,
            vision_token_id=151654,
        )
        text_encoder = Qwen2_5_VLForConditionalGeneration(config)
        tokenizer = Qwen2Tokenizer.from_pretrained(tiny_ckpt_id)

        components = {
            "transformer": transformer,
            "vae": vae,
            "scheduler": scheduler,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "processor": Qwen2VLProcessor.from_pretrained(tiny_ckpt_id),
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        """Return call kwargs for the pipeline: a prompt, two blank 32x32 RGB reference images and a seeded generator."""
        # On "mps" devices the global seeding call is used instead of a device-bound generator.
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        image = Image.new("RGB", (32, 32))
        inputs = {
            "prompt": "dance monkey",
            "image": [image, image],
            "negative_prompt": "bad quality",
            "generator": generator,
            "num_inference_steps": 2,
            "true_cfg_scale": 1.0,
            "height": 32,
            "width": 32,
            "max_sequence_length": 16,
            "output_type": "pt",
        }

        return inputs

    def test_inference(self):
        """Run the pipeline on CPU and compare the first/last eight flattened output values to reference numbers."""
        device = "cpu"

        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = pipe(**inputs).images
        generated_image = image[0]
        self.assertEqual(generated_image.shape, (3, 32, 32))

        # fmt: off
        expected_slice = torch.tensor([[0.5637, 0.6341, 0.6001, 0.5620, 0.5794, 0.5498, 0.5757, 0.6389, 0.4174, 0.3597, 0.5649, 0.4894, 0.4969, 0.5255, 0.4083, 0.4986]])
        # fmt: on

        generated_slice = generated_image.flatten()
        generated_slice = torch.cat([generated_slice[:8], generated_slice[-8:]])
        self.assertTrue(torch.allclose(generated_slice, expected_slice, atol=1e-3))

    def test_attention_slicing_forward_pass(
        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
    ):
        """Check that attention slicing (slice sizes 1 and 2) leaves the output within `expected_max_diff`.

        `test_mean_pixel_difference` is accepted for signature compatibility but is not used here.
        """
        if not self.test_attention_slicing:
            return

        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        for component in pipe.components.values():
            if hasattr(component, "set_default_attn_processor"):
                component.set_default_attn_processor()
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        generator_device = "cpu"
        inputs = self.get_dummy_inputs(generator_device)
        output_without_slicing = pipe(**inputs)[0]

        pipe.enable_attention_slicing(slice_size=1)
        inputs = self.get_dummy_inputs(generator_device)
        output_with_slicing1 = pipe(**inputs)[0]

        pipe.enable_attention_slicing(slice_size=2)
        inputs = self.get_dummy_inputs(generator_device)
        output_with_slicing2 = pipe(**inputs)[0]

        if test_max_difference:
            max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
            max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
            self.assertLess(
                max(max_diff1, max_diff2),
                expected_max_diff,
                "Attention slicing should not affect the inference results",
            )

    def test_vae_tiling(self, expected_diff_max: float = 0.2):
        """Check that enabling VAE tiling changes a 128x128 output by less than `expected_diff_max`."""
        generator_device = "cpu"
        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe.to("cpu")
        pipe.set_progress_bar_config(disable=None)

        # Without tiling
        inputs = self.get_dummy_inputs(generator_device)
        inputs["height"] = inputs["width"] = 128
        output_without_tiling = pipe(**inputs)[0]

        # With tiling
        pipe.vae.enable_tiling(
            tile_sample_min_height=96,
            tile_sample_min_width=96,
            tile_sample_stride_height=64,
            tile_sample_stride_width=64,
        )
        inputs = self.get_dummy_inputs(generator_device)
        inputs["height"] = inputs["width"] = 128
        output_with_tiling = pipe(**inputs)[0]

        # Compare magnitudes: a signed difference would let large negative deviations slip through.
        self.assertLess(
            np.abs(to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
            expected_diff_max,
            "VAE tiling should not affect the inference results",
        )

    @pytest.mark.xfail(condition=True, reason="Preconfigured embeddings need to be revisited.", strict=True)
    def test_encode_prompt_works_in_isolation(self, extra_required_param_value_dict=None, atol=1e-4, rtol=1e-4):
        """Delegate to the mixin's test; currently marked as a strict expected failure."""
        super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict, atol, rtol)

    # The three overrides below must take `self`: without it, invoking the bound method raises
    # TypeError before the inherited test runs, so the strict xfail was met for the wrong reason.
    @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True)
    def test_num_images_per_prompt(self):
        """Delegate to the mixin's test; currently marked as a strict expected failure."""
        super().test_num_images_per_prompt()

    @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True)
    def test_inference_batch_consistent(self):
        """Delegate to the mixin's test; currently marked as a strict expected failure."""
        super().test_inference_batch_consistent()

    @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True)
    def test_inference_batch_single_identical(self):
        """Delegate to the mixin's test; currently marked as a strict expected failure."""
        super().test_inference_batch_single_identical()