diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index 9d3c576107..7eb6b495ab 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -1621,6 +1621,24 @@ LORA_ATTENTION_PROCESSORS = (
     LoRAAttnAddedKVProcessor,
 )
 
+ADDED_KV_ATTENTION_PROCESSORS = (
+    AttnAddedKVProcessor,
+    SlicedAttnAddedKVProcessor,
+    AttnAddedKVProcessor2_0,
+    XFormersAttnAddedKVProcessor,
+    LoRAAttnAddedKVProcessor,
+)
+
+CROSS_ATTENTION_PROCESSORS = (
+    AttnProcessor,
+    AttnProcessor2_0,
+    XFormersAttnProcessor,
+    SlicedAttnProcessor,
+    LoRAAttnProcessor,
+    LoRAAttnProcessor2_0,
+    LoRAXFormersAttnProcessor,
+)
+
 AttentionProcessor = Union[
     AttnProcessor,
     AttnProcessor2_0,
diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py
index 126d7f5287..72157e5827 100644
--- a/src/diffusers/models/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoder_kl.py
@@ -20,7 +20,13 @@ import torch.nn as nn
 from ..configuration_utils import ConfigMixin, register_to_config
 from ..loaders import FromOriginalVAEMixin
 from ..utils import BaseOutput, apply_forward_hook
-from .attention_processor import AttentionProcessor, AttnProcessor
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
 from .modeling_utils import ModelMixin
 from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
 
@@ -228,7 +234,16 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
         """
         Disables custom attention processors and sets the default attention implementation.
         """
-        self.set_attn_processor(AttnProcessor())
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor)
 
     @apply_forward_hook
     def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py
index f89c4624cb..db05b0689c 100644
--- a/src/diffusers/models/controlnet.py
+++ b/src/diffusers/models/controlnet.py
@@ -21,7 +21,13 @@ from torch.nn import functional as F
 from ..configuration_utils import ConfigMixin, register_to_config
 from ..loaders import FromOriginalControlnetMixin
 from ..utils import BaseOutput, logging
-from .attention_processor import AttentionProcessor, AttnProcessor
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
 from .embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
 from .modeling_utils import ModelMixin
 from .unet_2d_blocks import (
@@ -550,7 +556,16 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
         """
         Disables custom attention processors and sets the default attention implementation.
""" - self.set_attn_processor(AttnProcessor()) + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice def set_attention_slice(self, slice_size): diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 569c3346ef..fd744e9d19 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -8,7 +8,13 @@ from torch import nn from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput from .attention import BasicTransformerBlock -from .attention_processor import AttentionProcessor, AttnProcessor +from .attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) from .embeddings import TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin @@ -224,7 +230,16 @@ class PriorTransformer(ModelMixin, ConfigMixin): """ Disables custom attention processors and sets the default attention implementation. """ - self.set_attn_processor(AttnProcessor()) + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) def forward( self, diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 7930667faf..f793be49c3 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -22,7 +22,13 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging from .activations import get_activation -from .attention_processor import AttentionProcessor, AttnProcessor +from .attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) from .embeddings import ( GaussianFourierProjection, ImageHintTimeEmbedding, @@ -639,7 +645,16 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) """ Disables custom attention processors and sets the default attention implementation. 
""" - self.set_attn_processor(AttnProcessor()) + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) def set_attention_slice(self, slice_size): r""" diff --git a/src/diffusers/models/unet_3d_condition.py b/src/diffusers/models/unet_3d_condition.py index e6a875886d..58c848fdb9 100644 --- a/src/diffusers/models/unet_3d_condition.py +++ b/src/diffusers/models/unet_3d_condition.py @@ -22,7 +22,13 @@ import torch.utils.checkpoint from ..configuration_utils import ConfigMixin, register_to_config from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging -from .attention_processor import AttentionProcessor, AttnProcessor +from .attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) from .embeddings import TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin from .transformer_temporal import TransformerTemporalModel @@ -439,7 +445,16 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) """ Disables custom attention processors and sets the default attention implementation. """ - self.set_attn_processor(AttnProcessor()) + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)): diff --git a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py index 5e5c05a8d8..d39b2c99dd 100644 --- a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py +++ b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py @@ -22,7 +22,13 @@ import torch.utils.checkpoint from ...configuration_utils import ConfigMixin, register_to_config from ...loaders import UNet2DConditionLoadersMixin from ...models.activations import get_activation -from ...models.attention_processor import AttentionProcessor, AttnProcessor +from ...models.attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) from ...models.embeddings import ( TimestepEmbedding, Timesteps, @@ -571,7 +577,16 @@ class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoad """ Disables custom attention processors and sets the default attention implementation. 
""" - self.set_attn_processor(AttnProcessor()) + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice def set_attention_slice(self, slice_size): diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 3122fd29aa..8af417241b 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -10,6 +10,8 @@ from ...models import ModelMixin from ...models.activations import get_activation from ...models.attention import Attention from ...models.attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnAddedKVProcessor, AttnAddedKVProcessor2_0, @@ -844,7 +846,17 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): """ Disables custom attention processors and sets the default attention implementation. """ - self.set_attn_processor(AttnProcessor()) + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + "Cannot call `set_default_attn_processor` when attention processors are of type" + f" {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) def set_attention_slice(self, slice_size): r""" diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py index bb5335ca30..5019c7eb27 100644 --- a/tests/models/test_models_unet_2d.py +++ b/tests/models/test_models_unet_2d.py @@ -263,7 +263,7 @@ class NCSNppModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): output_slice = output[0, -3:, -3:, -1].flatten().cpu() # fmt: off - expected_output_slice = torch.tensor([-4842.8691, -6499.6631, -3800.1953, -7978.2686, -10980.7129, -20028.8535, 8148.2822, 2342.2905, 567.7608]) + expected_output_slice = torch.tensor([-4836.2178, -6487.1470, -3816.8196, -7964.9302, -10966.3037, -20043.5957, 8137.0513, 2340.3328, 544.6056]) # fmt: on self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2)) diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index da7c719240..85d6f48a1b 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -673,7 +673,7 @@ class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.Test new_model.load_attn_procs(tmpdirname, use_safetensors=True) self.assertIn("Error no file named pytorch_lora_weights.safetensors", str(e.exception)) - def test_lora_on_off(self): + def test_lora_on_off(self, expected_max_diff=1e-3): # enable deterministic behavior for gradient checkpointing init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -697,14 +697,17 @@ class 
         with torch.no_grad():
             new_sample = model(**inputs_dict).sample
 
-        assert (sample - new_sample).abs().max() < 1e-4
-        assert (sample - old_sample).abs().max() < 3e-3
+        max_diff_new_sample = (sample - new_sample).abs().max()
+        max_diff_old_sample = (sample - old_sample).abs().max()
+
+        assert max_diff_new_sample < expected_max_diff
+        assert max_diff_old_sample < expected_max_diff
 
     @unittest.skipIf(
         torch_device != "cuda" or not is_xformers_available(),
         reason="XFormers attention is only available with CUDA and `xformers` installed",
     )
-    def test_lora_xformers_on_off(self):
+    def test_lora_xformers_on_off(self, expected_max_diff=1e-3):
         # enable deterministic behavior for gradient checkpointing
         init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
 
@@ -726,8 +729,11 @@ class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.Test
             model.disable_xformers_memory_efficient_attention()
             off_sample = model(**inputs_dict).sample
 
-        assert (sample - on_sample).abs().max() < 1e-4
-        assert (sample - off_sample).abs().max() < 1e-4
+        max_diff_on_sample = (sample - on_sample).abs().max()
+        max_diff_off_sample = (sample - off_sample).abs().max()
+
+        assert max_diff_on_sample < expected_max_diff
+        assert max_diff_off_sample < expected_max_diff
 
     def test_custom_diffusion_processors(self):
         # enable deterministic behavior for gradient checkpointing
diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py
index 9ff2936cbd..9d0ac96888 100644
--- a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py
+++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py
@@ -218,6 +218,9 @@ class KandinskyV22ControlnetImg2ImgPipelineFastTests(PipelineTesterMixin, unitte
             np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
         ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
 
+    def test_inference_batch_single_identical(self):
+        super().test_inference_batch_single_identical(expected_max_diff=1.75e-3)
+
 
 @slow
 @require_torch_gpu
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index 5bd8f00591..7935a63ece 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -42,6 +42,7 @@ from diffusers.utils import load_numpy, nightly, slow, torch_device
 from diffusers.utils.testing_utils import (
     CaptureLogger,
     enable_full_determinism,
+    numpy_cosine_similarity_distance,
     require_torch_2,
     require_torch_gpu,
     run_test_in_subprocess,
@@ -760,7 +761,8 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         # make sure that more than 3.75 GB is allocated
         mem_bytes = torch.cuda.max_memory_allocated()
         assert mem_bytes > 3.75 * 10**9
-        assert np.abs(image_sliced - image).max() < 1e-3
+        max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
+        assert max_diff < 1e-3
 
     def test_stable_diffusion_vae_slicing(self):
         torch.cuda.reset_peak_memory_stats()
@@ -792,7 +794,8 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         mem_bytes = torch.cuda.max_memory_allocated()
         assert mem_bytes > 4e9
         # There is a small discrepancy at the image borders vs. a fully batched version.
-        assert np.abs(image_sliced - image).max() < 1e-2
+        max_diff = numpy_cosine_similarity_distance(image_sliced.flatten(), image.flatten())
+        assert max_diff < 1e-2
 
     def test_stable_diffusion_vae_tiling(self):
         torch.cuda.reset_peak_memory_stats()
@@ -837,7 +840,8 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         image = output.images
 
         assert mem_bytes < 1e10
-        assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-2
+        max_diff = numpy_cosine_similarity_distance(image_chunked.flatten(), image.flatten())
+        assert max_diff < 1e-2
 
     def test_stable_diffusion_fp16_vs_autocast(self):
         # this test makes sure that the original model with autocast
@@ -968,7 +972,11 @@ class StableDiffusionPipelineSlowTests(unittest.TestCase):
         outputs_offloaded = pipe(**inputs)
         mem_bytes_offloaded = torch.cuda.max_memory_allocated()
 
-        assert np.abs(outputs.images - outputs_offloaded.images).max() < 1e-3
+        images = outputs.images
+        offloaded_images = outputs_offloaded.images
+
+        max_diff = numpy_cosine_similarity_distance(images.flatten(), offloaded_images.flatten())
+        assert max_diff < 1e-3
         assert mem_bytes_offloaded < mem_bytes
         assert mem_bytes_offloaded < 3.5 * 10**9
         for module in pipe.text_encoder, pipe.unet, pipe.vae, pipe.safety_checker:
@@ -1075,7 +1083,9 @@ class StableDiffusionPipelineCkptTests(unittest.TestCase):
         generator = torch.Generator(device="cpu").manual_seed(0)
         image = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0]
 
-        assert np.max(np.abs(image - image_ckpt)) < 1e-3
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten())
+
+        assert max_diff < 1e-3
 
 
 @nightly
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
index 294a867828..a10e74742c 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
@@ -147,7 +147,10 @@ class StableDiffusionPanoramaPipelineFastTests(PipelineLatentTesterMixin, Pipeli
 
     # override to speed the overall test timing up.
     def test_inference_batch_single_identical(self):
-        super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=3.25e-3)
+        super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=5.0e-3)
+
+    def test_float16_inference(self):
+        super().test_float16_inference(expected_max_diff=1e-1)
 
     def test_stable_diffusion_panorama_negative_prompt(self):
         device = "cpu"  # ensure determinism for the device-dependent torch.Generator
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
index 2346492a95..3991366966 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
@@ -33,7 +33,12 @@ from diffusers import (
     logging,
 )
 from diffusers.utils import load_numpy, nightly, slow, torch_device
-from diffusers.utils.testing_utils import CaptureLogger, enable_full_determinism, require_torch_gpu
+from diffusers.utils.testing_utils import (
+    CaptureLogger,
+    enable_full_determinism,
+    numpy_cosine_similarity_distance,
+    require_torch_gpu,
+)
 
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
@@ -473,7 +478,10 @@ class StableDiffusion2PipelineSlowTests(unittest.TestCase):
         outputs_offloaded = pipe(**inputs)
         mem_bytes_offloaded = torch.cuda.max_memory_allocated()
 
-        assert np.abs(outputs.images - outputs_offloaded.images).max() < 1e-3
+        images = outputs.images
+        images_offloaded = outputs_offloaded.images
+        max_diff = numpy_cosine_similarity_distance(images.flatten(), images_offloaded.flatten())
+        assert max_diff < 1e-3
         assert mem_bytes_offloaded < mem_bytes
         assert mem_bytes_offloaded < 3 * 10**9
         for module in pipe.text_encoder, pipe.unet, pipe.vae:
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
index b4d49c9242..3e280058b1 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py
@@ -27,7 +27,7 @@ from diffusers import (
     UNet2DConditionModel,
 )
 from diffusers.utils import load_numpy, skip_mps, slow
-from diffusers.utils.testing_utils import require_torch_gpu
+from diffusers.utils.testing_utils import numpy_cosine_similarity_distance, require_torch_gpu
 
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
@@ -226,4 +226,5 @@ class StableDiffusionAttendAndExcitePipelineIntegrationTests(unittest.TestCase):
         expected_image = load_numpy(
             "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/attend-and-excite/elephant_glasses.npy"
         )
-        assert np.abs((expected_image - image).max()) < 5e-1
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
+        assert max_diff < 5e-1
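Note: many of the test changes in this patch swap elementwise np.abs(...).max() checks for numpy_cosine_similarity_distance from diffusers.utils.testing_utils. As a rough sketch only (not necessarily the library's exact implementation), such a helper can be thought of as one minus the cosine similarity of the two flattened outputs:

    import numpy as np

    def cosine_similarity_distance(a: np.ndarray, b: np.ndarray) -> float:
        # Hypothetical stand-in for diffusers.utils.testing_utils.numpy_cosine_similarity_distance.
        # A small value means the two outputs are globally similar even if a few individual
        # pixels differ, which makes slow-test assertions less brittle than a max absolute diff.
        similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
        return 1.0 - float(similarity)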
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
index 1db2e18e5b..a8c857d755 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
@@ -32,7 +32,7 @@ from diffusers import (
 )
 from diffusers.models.attention_processor import AttnProcessor
 from diffusers.utils import load_numpy, slow, torch_device
-from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu
+from diffusers.utils.testing_utils import enable_full_determinism, numpy_cosine_similarity_distance, require_torch_gpu
 
 
 enable_full_determinism()
@@ -364,7 +364,8 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
         # make sure that more than 5.5 GB is allocated
         mem_bytes = torch.cuda.max_memory_allocated()
         assert mem_bytes > 5.5 * 10**9
-        assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-3
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_chunked.flatten())
+        assert max_diff < 1e-3
 
     def test_stable_diffusion_text2img_pipeline_v_pred_default(self):
         expected_image = load_numpy(
@@ -384,7 +385,8 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
         image = output.images[0]
 
         assert image.shape == (768, 768, 3)
-        assert np.abs(expected_image - image).max() < 9e-1
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
+        assert max_diff < 1e-3
 
     def test_stable_diffusion_text2img_pipeline_unflawed(self):
         expected_image = load_numpy(
@@ -402,12 +404,13 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
 
         prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
 
-        generator = torch.manual_seed(0)
+        generator = torch.Generator("cpu").manual_seed(0)
         output = pipe(prompt=prompt, guidance_scale=7.5, guidance_rescale=0.7, generator=generator, output_type="np")
         image = output.images[0]
 
         assert image.shape == (768, 768, 3)
-        assert np.abs(expected_image - image).max() < 5e-1
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
+        assert max_diff < 1e-2
 
     def test_stable_diffusion_text2img_pipeline_v_pred_fp16(self):
         expected_image = load_numpy(
@@ -426,7 +429,8 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
         image = output.images[0]
 
         assert image.shape == (768, 768, 3)
-        assert np.abs(expected_image - image).max() < 7.5e-1
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
+        assert max_diff < 1e-3
 
     def test_download_local(self):
         filename = hf_hub_download("stabilityai/stable-diffusion-2-1", filename="v2-1_768-ema-pruned.safetensors")
@@ -460,7 +464,8 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
         generator = torch.Generator(device="cpu").manual_seed(0)
         image = pipe("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0]
 
-        assert np.max(np.abs(image - image_ckpt)) < 1e-3
+        max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten())
+        assert max_diff < 1e-3
 
     def test_stable_diffusion_text2img_intermediate_state_v_pred(self):
         number_of_steps = 0
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 0254391c43..319dcb5aab 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -24,7 +24,6 @@ from diffusers.utils import logging
 from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available
 from diffusers.utils.testing_utils import (
     CaptureLogger,
-    numpy_cosine_similarity_distance,
     require_torch,
     torch_device,
 )
@@ -304,6 +303,10 @@ class PipelineTesterMixin:
     def test_save_load_local(self, expected_max_difference=5e-4):
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -517,6 +520,10 @@ class PipelineTesterMixin:
     def test_dict_tuple_outputs_equivalent(self, expected_max_difference=1e-4):
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -537,18 +544,26 @@ class PipelineTesterMixin:
     def test_float16_inference(self, expected_max_diff=1e-2):
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         components = self.get_dummy_components()
         pipe_fp16 = self.pipeline_class(**components)
+        for component in pipe_fp16.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
+
         pipe_fp16.to(torch_device, torch.float16)
         pipe_fp16.set_progress_bar_config(disable=None)
 
         output = pipe(**self.get_dummy_inputs(torch_device))[0]
         output_fp16 = pipe_fp16(**self.get_dummy_inputs(torch_device))[0]
 
-        max_diff = numpy_cosine_similarity_distance(to_np(output).flatten(), to_np(output_fp16).flatten())
+        max_diff = np.abs(to_np(output) - to_np(output_fp16)).max()
         self.assertLess(max_diff, expected_max_diff, "The outputs of the fp16 and fp32 pipelines are too different.")
 
     @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
@@ -557,7 +572,11 @@ class PipelineTesterMixin:
         for name, module in components.items():
             if hasattr(module, "half"):
                 components[name] = module.to(torch_device).half()
+
         pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -567,6 +586,9 @@ class PipelineTesterMixin:
         with tempfile.TemporaryDirectory() as tmpdir:
             pipe.save_pretrained(tmpdir)
             pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, torch_dtype=torch.float16)
+            for component in pipe_loaded.components.values():
+                if hasattr(component, "set_default_attn_processor"):
+                    component.set_default_attn_processor()
             pipe_loaded.to(torch_device)
             pipe_loaded.set_progress_bar_config(disable=None)
 
@@ -579,7 +601,6 @@ class PipelineTesterMixin:
 
         inputs = self.get_dummy_inputs(torch_device)
         output_loaded = pipe_loaded(**inputs)[0]
-
         max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
         self.assertLess(
             max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading."
@@ -591,6 +612,9 @@ class PipelineTesterMixin:
 
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -604,6 +628,9 @@ class PipelineTesterMixin:
         with tempfile.TemporaryDirectory() as tmpdir:
             pipe.save_pretrained(tmpdir, safe_serialization=False)
             pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
+            for component in pipe_loaded.components.values():
+                if hasattr(component, "set_default_attn_processor"):
+                    component.set_default_attn_processor()
             pipe_loaded.to(torch_device)
             pipe_loaded.set_progress_bar_config(disable=None)
 
@@ -662,6 +689,9 @@ class PipelineTesterMixin:
 
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -686,6 +716,9 @@ class PipelineTesterMixin:
     def test_cpu_offload_forward_pass(self, expected_max_diff=1e-4):
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -714,6 +747,9 @@ class PipelineTesterMixin:
 
         components = self.get_dummy_components()
         pipe = self.pipeline_class(**components)
+        for component in pipe.components.values():
+            if hasattr(component, "set_default_attn_processor"):
+                component.set_default_attn_processor()
         pipe.to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
@@ -731,7 +767,7 @@ class PipelineTesterMixin:
         )
 
         if test_max_difference:
-            max_diff = np.abs(output_with_offload - output_without_offload).max()
+            max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
             self.assertLess(max_diff, expected_max_diff, "XFormers attention should not affect the inference results")
 
         if test_mean_pixel_difference:
diff --git a/tests/pipelines/text_to_video/test_video_to_video.py b/tests/pipelines/text_to_video/test_video_to_video.py
index 41e213c43d..9e61ddcbbd 100644
--- a/tests/pipelines/text_to_video/test_video_to_video.py
+++ b/tests/pipelines/text_to_video/test_video_to_video.py
@@ -150,6 +150,9 @@ class VideoToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
 
+    def test_save_load_optional_components(self):
+        super().test_save_load_optional_components(expected_max_difference=0.001)
+
     @unittest.skipIf(
         torch_device != "cuda" or not is_xformers_available(),
         reason="XFormers attention is only available with CUDA and `xformers` installed",
     )
diff --git a/tests/pipelines/unclip/test_unclip.py b/tests/pipelines/unclip/test_unclip.py
index 393c3ba163..46890904a3 100644
--- a/tests/pipelines/unclip/test_unclip.py
+++ b/tests/pipelines/unclip/test_unclip.py
@@ -406,7 +406,7 @@ class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
 
     @skip_mps
     def test_save_load_local(self):
-        return super().test_save_load_local()
+        return super().test_save_load_local(expected_max_difference=5e-3)
 
     @skip_mps
     def test_save_load_optional_components(self):
diff --git a/tests/pipelines/unclip/test_unclip_image_variation.py b/tests/pipelines/unclip/test_unclip_image_variation.py
index 75a2625080..2604368104 100644
--- a/tests/pipelines/unclip/test_unclip_image_variation.py
+++ b/tests/pipelines/unclip/test_unclip_image_variation.py
@@ -477,7 +477,7 @@ class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCa
 
     @skip_mps
     def test_save_load_local(self):
-        return super().test_save_load_local()
+        return super().test_save_load_local(expected_max_difference=4e-3)
 
     @skip_mps
     def test_save_load_optional_components(self):
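Note: with the model changes in this patch, set_default_attn_processor picks the fallback processor from whatever processors are currently installed instead of always using AttnProcessor. A minimal usage sketch under that assumption (the checkpoint name is only an illustrative example):

    from diffusers import UNet2DConditionModel
    from diffusers.models.attention_processor import AttnProcessor2_0

    # Example checkpoint; any diffusers model exposing attn_processors behaves the same way.
    unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")

    unet.set_attn_processor(AttnProcessor2_0())  # every processor is a cross-attention variant
    unet.set_default_attn_processor()            # falls back to AttnProcessor()

    # If every installed processor were an added-KV variant (e.g. AttnAddedKVProcessor),
    # the same call would install AttnAddedKVProcessor() instead; mixing the two families
    # raises a ValueError, as shown in the diff above.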