diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py deleted file mode 100644 index 00918bf7ba..0000000000 --- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py +++ /dev/null @@ -1,283 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, CycleDiffusionPipeline, DDIMScheduler, UNet2DConditionModel -from diffusers.utils.testing_utils import ( - enable_full_determinism, - floats_tensor, - load_image, - load_numpy, - nightly, - require_torch_gpu, - skip_mps, - torch_device, -) - -from ..pipeline_params import ( - IMAGE_TO_IMAGE_IMAGE_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, -) -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin - - -enable_full_determinism() - - -class CycleDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): - pipeline_class = CycleDiffusionPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - { - "negative_prompt", - "height", - "width", - "negative_prompt_embeds", - } - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"source_prompt"}) - image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS - image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, 
seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image / 2 + 0.5 - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "An astronaut riding an elephant", - "source_prompt": "An astronaut riding a horse", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "eta": 0.1, - "strength": 0.8, - "guidance_scale": 3, - "source_guidance_scale": 1, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_cycle(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - pipe = CycleDiffusionPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = pipe(**inputs) - images = output.images - - image_slice = images[0, -3:, -3:, -1] - - assert images.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4459, 0.4943, 0.4544, 0.6643, 0.5474, 0.4327, 0.5701, 0.5959, 0.5179]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_cycle_fp16(self): - components = self.get_dummy_components() - for name, module in components.items(): - if hasattr(module, "half"): - components[name] = module.half() - pipe = CycleDiffusionPipeline(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs) - images = output.images - - image_slice = images[0, -3:, -3:, -1] - - assert images.shape == (1, 32, 32, 3) - expected_slice = np.array([0.3506, 0.4543, 0.446, 0.4575, 0.5195, 0.4155, 0.5273, 0.518, 0.4116]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @unittest.skip("non-deterministic pipeline") - def test_inference_batch_single_identical(self): - return super().test_inference_batch_single_identical() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - -@nightly -@require_torch_gpu -class CycleDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_cycle_diffusion_pipeline_fp16(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/cycle-diffusion/black_colored_car.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/blue_colored_car_fp16.npy" - ) - init_image = init_image.resize((512, 512)) - - model_id = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler") - pipe = CycleDiffusionPipeline.from_pretrained( - model_id, scheduler=scheduler, safety_checker=None, torch_dtype=torch.float16, revision="fp16" - ) - - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - 
pipe.enable_attention_slicing() - - source_prompt = "A black colored car" - prompt = "A blue colored car" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=100, - eta=0.1, - strength=0.85, - guidance_scale=3, - source_guidance_scale=1, - generator=generator, - output_type="np", - ) - image = output.images - - # the values aren't exactly equal, but the images look the same visually - assert np.abs(image - expected_image).max() < 5e-1 - - def test_cycle_diffusion_pipeline(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/cycle-diffusion/black_colored_car.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/blue_colored_car.npy" - ) - init_image = init_image.resize((512, 512)) - - model_id = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler") - pipe = CycleDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, safety_checker=None) - - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - source_prompt = "A black colored car" - prompt = "A blue colored car" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=100, - eta=0.1, - strength=0.85, - guidance_scale=3, - source_guidance_scale=1, - generator=generator, - output_type="np", - ) - image = output.images - - assert np.abs(image - expected_image).max() < 2e-2 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py deleted file mode 100644 index 45563cdb79..0000000000 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py +++ /dev/null @@ -1,630 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
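Note: the file removed below held the only coverage of StableDiffusionInpaintPipelineLegacy's image/mask/strength call signature. As a reference for anyone pinning an older diffusers release where the pipeline still exists, here is a minimal usage sketch built from the deleted slow-test inputs (the checkpoint, URLs, and call arguments come straight from the tests; the "cuda" device is an assumption):

import torch
from diffusers import StableDiffusionInpaintPipelineLegacy
from diffusers.utils import load_image

# Benchmark image/mask pair used by the deleted slow tests.
init_image = load_image(
    "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
    "/stable_diffusion_inpaint/input_bench_image.png"
)
mask_image = load_image(
    "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
    "/stable_diffusion_inpaint/input_bench_mask.png"
)

pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained(
    "CompVis/stable-diffusion-v1-4", safety_checker=None
)
pipe.to("cuda")  # assumed device; the fast tests above run on CPU

# As in img2img, `strength` sets how much noise is added to the init
# image before denoising; the mask confines the change to its region.
image = pipe(
    prompt="A red cat sitting on a park bench",
    image=init_image,
    mask_image=mask_image,
    strength=0.75,
    guidance_scale=7.5,
    num_inference_steps=50,
    generator=torch.Generator(device="cpu").manual_seed(0),
).images[0]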
- -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionInpaintPipelineLegacy, - UNet2DConditionModel, - UNet2DModel, - VQModel, -) -from diffusers.utils.testing_utils import ( - enable_full_determinism, - floats_tensor, - load_image, - load_numpy, - nightly, - preprocess_image, - require_torch_gpu, - slow, - torch_device, -) - - -enable_full_determinism() - - -class StableDiffusionInpaintLegacyPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - @property - def dummy_cond_unet(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_cond_unet_inpaint(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vq_model(self): - torch.manual_seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - @property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = torch.ones([0]) - - def to(self, device): - self.pixel_values.to(device) - return self - - return Out() - - return extract - - def test_stable_diffusion_inpaint_legacy(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = 
self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ) - - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4941, 0.5396, 0.4689, 0.6338, 0.5392, 0.4094, 0.5477, 0.5904, 0.5165]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_inpaint_legacy_batched(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - init_images_tens = preprocess_image(init_image, batch_size=2) - init_masks_tens = init_images_tens + 4 - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - images = sd_pipe( - [prompt] * 2, - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_images_tens, - mask_image=init_masks_tens, - ).images - - assert images.shape == (2, 32, 32, 3) - - image_slice_0 = images[0, -3:, -3:, -1].flatten() - image_slice_1 = images[1, -3:, -3:, -1].flatten() - - expected_slice_0 = np.array([0.4697, 0.3770, 0.4096, 0.4653, 0.4497, 0.4183, 0.3950, 0.4668, 0.4672]) - expected_slice_1 = np.array([0.4105, 0.4987, 0.5771, 0.4921, 0.4237, 0.5684, 0.5496, 0.4645, 0.5272]) - - assert np.abs(expected_slice_0 - image_slice_0).max() < 1e-2 - assert np.abs(expected_slice_1 - image_slice_1).max() < 1e-2 - - def test_stable_diffusion_inpaint_legacy_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = 
PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - negative_prompt = "french fries" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe( - prompt, - negative_prompt=negative_prompt, - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4941, 0.5396, 0.4689, 0.6338, 0.5392, 0.4094, 0.5477, 0.5904, 0.5165]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): - device = "cpu" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - # test num_images_per_prompt=1 (default) - images = sd_pipe( - prompt, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ).images - - assert images.shape == (1, 32, 32, 3) - - # test num_images_per_prompt=1 (default) for batch of prompts - batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ).images - - assert images.shape == (batch_size, 32, 32, 3) - - # test num_images_per_prompt for single prompt - num_images_per_prompt = 2 - images = sd_pipe( - prompt, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, - ).images - - assert images.shape == (num_images_per_prompt, 32, 32, 3) - - # test num_images_per_prompt for batch of prompts - batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, - ).images - - assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3) - - -@slow -@require_torch_gpu -class StableDiffusionInpaintLegacyPipelineSlowTests(unittest.TestCase): - def tearDown(self): - 
super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, generator_device="cpu", seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_mask.png" - ) - inputs = { - "prompt": "A red cat sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_inpaint_legacy_pndm(self): - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.5665, 0.6117, 0.6430, 0.4057, 0.4594, 0.5658, 0.1596, 0.3106, 0.4305]) - - assert np.abs(expected_slice - image_slice).max() < 3e-3 - - def test_stable_diffusion_inpaint_legacy_batched(self): - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - inputs["prompt"] = [inputs["prompt"]] * 2 - inputs["image"] = preprocess_image(inputs["image"], batch_size=2) - - mask = inputs["mask_image"].convert("L") - mask = np.array(mask).astype(np.float32) / 255.0 - mask = torch.from_numpy(1 - mask) - masks = torch.vstack([mask[None][None]] * 2) - inputs["mask_image"] = masks - - image = pipe(**inputs).images - assert image.shape == (2, 512, 512, 3) - - image_slice_0 = image[0, 253:256, 253:256, -1].flatten() - image_slice_1 = image[1, 253:256, 253:256, -1].flatten() - - expected_slice_0 = np.array( - [0.52093095, 0.4176447, 0.32752383, 0.6175223, 0.50563973, 0.36470804, 0.65460044, 0.5775188, 0.44332123] - ) - expected_slice_1 = np.array( - [0.3592432, 0.4233033, 0.3914635, 0.31014425, 0.3702293, 0.39412856, 0.17526966, 0.2642669, 0.37480092] - ) - - assert np.abs(expected_slice_0 - image_slice_0).max() < 3e-3 - assert np.abs(expected_slice_1 - image_slice_1).max() < 3e-3 - - def test_stable_diffusion_inpaint_legacy_k_lms(self): - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.4534, 0.4467, 0.4329, 0.4329, 0.4339, 0.4220, 0.4244, 0.4332, 0.4426]) - - assert np.abs(expected_slice - image_slice).max() < 3e-3 - - def test_stable_diffusion_inpaint_legacy_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps 
+= 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.5977, 1.5449, 1.0586, -0.3250, 0.7383, -0.0862, 0.4631, -0.2571, -1.1289]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.5190, 1.1621, 0.6885, 0.2424, 0.3337, -0.1617, 0.6914, -0.1957, -0.5474]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - - callback_fn.has_been_called = False - - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 2 - - -@nightly -@require_torch_gpu -class StableDiffusionInpaintLegacyPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_mask.png" - ) - inputs = { - "prompt": "A red cat sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 50, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_inpaint_pndm(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_inpaint_ddim(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_inpaint_lms(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - 
expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_inpaint_dpm(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 30 - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py deleted file mode 100644 index 27c6a65b63..0000000000 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ /dev/null @@ -1,255 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
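Note: the model-editing tests removed below are the reference for that pipeline's two-step workflow: generate, call edit_model(source, destination) to rewrite a concept, then generate again and expect a visibly different image. A minimal sketch, assuming a diffusers release that still ships StableDiffusionModelEditingPipeline (the checkpoint and prompts are taken from the deleted nightly test; the "cuda" device is an assumption):

import torch
from diffusers import StableDiffusionModelEditingPipeline

pipe = StableDiffusionModelEditingPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", safety_checker=None
)
pipe.to("cuda")  # assumed device

prompt = "A field of roses"
before = pipe(prompt, generator=torch.manual_seed(0)).images[0]

# edit_model() rewrites cross-attention projection weights so that the
# source concept is rendered as the destination concept from then on.
pipe.edit_model("A pack of roses", "A pack of blue roses")

after = pipe(prompt, generator=torch.manual_seed(0)).images[0]
# The deleted nightly test asserts `before` and `after` differ markedly.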
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - PNDMScheduler, - StableDiffusionModelEditingPipeline, - UNet2DConditionModel, -) -from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, skip_mps, torch_device - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin - - -enable_full_determinism() - - -@skip_mps -class StableDiffusionModelEditingPipelineFastTests( - PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase -): - pipeline_class = StableDiffusionModelEditingPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler() - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - generator = torch.manual_seed(seed) - inputs = { - "prompt": "A field of roses", - "generator": generator, - # Setting height and width to None to prevent OOMs on CPU. 
- "height": None, - "width": None, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_model_editing_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionModelEditingPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.4755, 0.5132, 0.4976, 0.3904, 0.3554, 0.4765, 0.5139, 0.5158, 0.4889]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_model_editing_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionModelEditingPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.4992, 0.5101, 0.5004, 0.3949, 0.3604, 0.4735, 0.5216, 0.5204, 0.4913]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_model_editing_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionModelEditingPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.4747, 0.5372, 0.4779, 0.4982, 0.5543, 0.4816, 0.5238, 0.4904, 0.5027]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_model_editing_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler() - sd_pipe = StableDiffusionModelEditingPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - # the pipeline does not expect pndm so test if it raises error. 
- with self.assertRaises(ValueError): - _ = sd_pipe(**inputs).images - - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(expected_max_diff=5e-3) - - def test_attention_slicing_forward_pass(self): - super().test_attention_slicing_forward_pass(expected_max_diff=5e-3) - - -@nightly -@require_torch_gpu -class StableDiffusionModelEditingSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, seed=0): - generator = torch.manual_seed(seed) - inputs = { - "prompt": "A field of roses", - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_model_editing_default(self): - model_ckpt = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionModelEditingPipeline.from_pretrained(model_ckpt, safety_checker=None) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - - expected_slice = np.array( - [0.6749496, 0.6386453, 0.51443267, 0.66094905, 0.61921215, 0.5491332, 0.5744417, 0.58075106, 0.5174658] - ) - - assert np.abs(expected_slice - image_slice).max() < 1e-2 - - # make sure image changes after editing - pipe.edit_model("A pack of roses", "A pack of blue roses") - - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(expected_slice - image_slice).max() > 1e-1 - - def test_stable_diffusion_model_editing_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - model_ckpt = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") - pipe = StableDiffusionModelEditingPipeline.from_pretrained( - model_ckpt, scheduler=scheduler, safety_checker=None - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs() - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 4.4 GB is allocated - assert mem_bytes < 4.4 * 10**9 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_paradigms.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_paradigms.py deleted file mode 100644 index ae9bc83fe0..0000000000 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_paradigms.py +++ /dev/null @@ -1,228 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
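Note: the ParaDiGMS tests removed below were the only exercise of the pipeline's parallel-sampling arguments: a DDIMParallelScheduler (or DDPMParallelScheduler) plus `parallel`, the window of timesteps denoised in one batched model call. A minimal sketch of that call pattern, assuming a release that still includes StableDiffusionParadigmsPipeline (the checkpoint and arguments mirror the deleted slow test; the "cuda" device is an assumption):

import torch
from diffusers import DDIMParallelScheduler, StableDiffusionParadigmsPipeline

model_ckpt = "stabilityai/stable-diffusion-2-base"
scheduler = DDIMParallelScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
pipe = StableDiffusionParadigmsPipeline.from_pretrained(
    model_ckpt, scheduler=scheduler, safety_checker=None
)
pipe.to("cuda")  # assumed device

image = pipe(
    "a photograph of an astronaut riding a horse",
    num_inference_steps=10,
    guidance_scale=7.5,
    parallel=3,  # timesteps processed concurrently per batched step
    debug=True,  # slower, deterministic path used by the deleted tests
    generator=torch.Generator(device="cuda").manual_seed(0),
).images[0]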
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMParallelScheduler, - DDPMParallelScheduler, - StableDiffusionParadigmsPipeline, - UNet2DConditionModel, -) -from diffusers.utils.testing_utils import ( - enable_full_determinism, - nightly, - require_torch_gpu, - torch_device, -) - -from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin - - -enable_full_determinism() - - -class StableDiffusionParadigmsPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionParadigmsPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - # SD2-specific config below - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - scheduler = DDIMParallelScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "generator": generator, - "num_inference_steps": 10, - "guidance_scale": 6.0, - "output_type": "numpy", - "parallel": 3, - "debug": True, - } - return inputs - - def test_stable_diffusion_paradigms_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionParadigmsPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.4773, 0.5417, 0.4723, 0.4925, 0.5631, 0.4752, 0.5240, 0.4935, 0.5023]) - - assert 
np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_paradigms_default_case_ddpm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - torch.manual_seed(0) - components["scheduler"] = DDPMParallelScheduler() - torch.manual_seed(0) - sd_pipe = StableDiffusionParadigmsPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.3573, 0.4420, 0.4960, 0.4799, 0.3796, 0.3879, 0.4819, 0.4365, 0.4468]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # override to speed the overall test timing up. - def test_inference_batch_consistent(self): - super().test_inference_batch_consistent(batch_sizes=[1, 2]) - - # override to speed the overall test timing up. - def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=3e-3) - - def test_stable_diffusion_paradigms_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionParadigmsPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.4771, 0.5420, 0.4683, 0.4918, 0.5636, 0.4725, 0.5230, 0.4923, 0.5015]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - -@nightly -@require_torch_gpu -class StableDiffusionParadigmsPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, seed=0): - generator = torch.Generator(device=torch_device).manual_seed(seed) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "generator": generator, - "num_inference_steps": 10, - "guidance_scale": 7.5, - "output_type": "numpy", - "parallel": 3, - "debug": True, - } - return inputs - - def test_stable_diffusion_paradigms_default(self): - model_ckpt = "stabilityai/stable-diffusion-2-base" - scheduler = DDIMParallelScheduler.from_pretrained(model_ckpt, subfolder="scheduler") - pipe = StableDiffusionParadigmsPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - - expected_slice = np.array([0.9622, 0.9602, 0.9748, 0.9591, 0.9630, 0.9691, 0.9661, 0.9631, 0.9741]) - - assert np.abs(expected_slice - image_slice).max() < 1e-2 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py deleted file mode 100644 index 2b4dc98abd..0000000000 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py +++ /dev/null @@ -1,590 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import tempfile -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMInverseScheduler, - DDIMScheduler, - DDPMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, - StableDiffusionPix2PixZeroPipeline, - UNet2DConditionModel, -) -from diffusers.image_processor import VaeImageProcessor -from diffusers.utils.testing_utils import ( - enable_full_determinism, - floats_tensor, - load_image, - load_numpy, - load_pt, - nightly, - require_torch_gpu, - skip_mps, - torch_device, -) - -from ..pipeline_params import ( - TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, - TEXT_GUIDED_IMAGE_VARIATION_PARAMS, - TEXT_TO_IMAGE_IMAGE_PARAMS, -) -from ..test_pipelines_common import ( - PipelineLatentTesterMixin, - PipelineTesterMixin, - assert_mean_pixel_difference, -) - - -enable_full_determinism() - - -@skip_mps -class StableDiffusionPix2PixZeroPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPix2PixZeroPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"image"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - image_params = TEXT_TO_IMAGE_IMAGE_PARAMS - image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS - - @classmethod - def setUpClass(cls): - cls.source_embeds = load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/src_emb_0.pt" - ) - - cls.target_embeds = load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/tgt_emb_0.pt" - ) - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler() - inverse_scheduler = DDIMInverseScheduler() - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - "inverse_scheduler": inverse_scheduler, - "caption_generator": None, 
- "caption_processor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - generator = torch.manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "cross_attention_guidance_amount": 0.15, - "source_embeds": self.source_embeds, - "target_embeds": self.target_embeds, - "output_type": "numpy", - } - return inputs - - def get_dummy_inversion_inputs(self, device, seed=0): - dummy_image = floats_tensor((2, 3, 32, 32), rng=random.Random(seed)).to(torch_device) - dummy_image = dummy_image / 2 + 0.5 - generator = torch.manual_seed(seed) - - inputs = { - "prompt": [ - "A painting of a squirrel eating a burger", - "A painting of a burger eating a squirrel", - ], - "image": dummy_image.cpu(), - "num_inference_steps": 2, - "guidance_scale": 6.0, - "generator": generator, - "output_type": "numpy", - } - return inputs - - def get_dummy_inversion_inputs_by_type(self, device, seed=0, input_image_type="pt", output_type="np"): - inputs = self.get_dummy_inversion_inputs(device, seed) - - if input_image_type == "pt": - image = inputs["image"] - elif input_image_type == "np": - image = VaeImageProcessor.pt_to_numpy(inputs["image"]) - elif input_image_type == "pil": - image = VaeImageProcessor.pt_to_numpy(inputs["image"]) - image = VaeImageProcessor.numpy_to_pil(image) - else: - raise ValueError(f"unsupported input_image_type {input_image_type}") - - inputs["image"] = image - inputs["output_type"] = output_type - - return inputs - - def test_save_load_optional_components(self): - if not hasattr(self.pipeline_class, "_optional_components"): - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - # set all optional components to None and update pipeline config accordingly - for optional_component in pipe._optional_components: - setattr(pipe, optional_component, None) - pipe.register_modules(**{optional_component: None for optional_component in pipe._optional_components}) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) - pipe_loaded.to(torch_device) - pipe_loaded.set_progress_bar_config(disable=None) - - for optional_component in pipe._optional_components: - self.assertTrue( - getattr(pipe_loaded, optional_component) is None, - f"`{optional_component}` did not stay set to None after loading.", - ) - - inputs = self.get_dummy_inputs(torch_device) - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(output - output_loaded).max() - self.assertLess(max_diff, 1e-4) - - def test_stable_diffusion_pix2pix_zero_inversion(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inversion_inputs(device) - inputs["image"] = inputs["image"][:1] - inputs["prompt"] = inputs["prompt"][:1] - image = sd_pipe.invert(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4732, 0.4630, 0.5722, 0.5103, 0.5140, 0.5622, 0.5104, 0.5390, 0.5020]) - - assert np.abs(image_slice.flatten() - 
expected_slice).max() < 1e-3
-
-    def test_stable_diffusion_pix2pix_zero_inversion_batch(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        sd_pipe = StableDiffusionPix2PixZeroPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inversion_inputs(device)
-        image = sd_pipe.invert(**inputs).images
-        image_slice = image[1, -3:, -3:, -1]
-        assert image.shape == (2, 32, 32, 3)
-        expected_slice = np.array([0.6046, 0.5400, 0.4902, 0.4448, 0.4694, 0.5498, 0.4857, 0.5073, 0.5089])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
-
-    def test_stable_diffusion_pix2pix_zero_default_case(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        sd_pipe = StableDiffusionPix2PixZeroPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        image = sd_pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1]
-        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.4863, 0.5053, 0.5033, 0.4007, 0.3571, 0.4768, 0.5176, 0.5277, 0.4940])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
-
-    def test_stable_diffusion_pix2pix_zero_negative_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        sd_pipe = StableDiffusionPix2PixZeroPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        negative_prompt = "french fries"
-        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
-        image = output.images
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5177, 0.5097, 0.5047, 0.4076, 0.3667, 0.4767, 0.5238, 0.5307, 0.4958])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
-
-    def test_stable_diffusion_pix2pix_zero_euler(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        components["scheduler"] = EulerAncestralDiscreteScheduler(
-            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
-        )
-        sd_pipe = StableDiffusionPix2PixZeroPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        image = sd_pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5421, 0.5525, 0.6085, 0.5279, 0.4658, 0.5317, 0.4418, 0.4815, 0.5132])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
-
-    def test_stable_diffusion_pix2pix_zero_ddpm(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        components["scheduler"] = DDPMScheduler()
-        sd_pipe = StableDiffusionPix2PixZeroPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        image = sd_pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.4861, 0.5053, 0.5038, 0.3994, 0.3562, 0.4768, 0.5172, 0.5280, 0.4938])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
-
-    def test_stable_diffusion_pix2pix_zero_inversion_pt_np_pil_outputs_equivalent(self):
-        device = torch_device
-        components = self.get_dummy_components()
-        sd_pipe = StableDiffusionPix2PixZeroPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        output_pt = sd_pipe.invert(**self.get_dummy_inversion_inputs_by_type(device, output_type="pt")).images
-        output_np = sd_pipe.invert(**self.get_dummy_inversion_inputs_by_type(device, output_type="np")).images
-        output_pil = sd_pipe.invert(**self.get_dummy_inversion_inputs_by_type(device, output_type="pil")).images
-
-        max_diff = np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max()
-        self.assertLess(max_diff, 1e-4, "`output_type=='pt'` generate different results from `output_type=='np'`")
-
-        max_diff = np.abs(np.array(output_pil[0]) - (output_np[0] * 255).round()).max()
-        self.assertLess(max_diff, 2.0, "`output_type=='pil'` generate different results from `output_type=='np'`")
-
-    def test_stable_diffusion_pix2pix_zero_inversion_pt_np_pil_inputs_equivalent(self):
-        device = torch_device
-        components = self.get_dummy_components()
-        sd_pipe = StableDiffusionPix2PixZeroPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        out_input_pt = sd_pipe.invert(**self.get_dummy_inversion_inputs_by_type(device, input_image_type="pt")).images
-        out_input_np = sd_pipe.invert(**self.get_dummy_inversion_inputs_by_type(device, input_image_type="np")).images
-        out_input_pil = sd_pipe.invert(
-            **self.get_dummy_inversion_inputs_by_type(device, input_image_type="pil")
-        ).images
-
-        max_diff = np.abs(out_input_pt - out_input_np).max()
-        self.assertLess(max_diff, 1e-4, "`input_type=='pt'` generate different result from `input_type=='np'`")
-
-        assert_mean_pixel_difference(out_input_pil, out_input_np, expected_max_diff=1)
-
-    # Non-determinism caused by the scheduler optimizing the latent inputs during inference
-    @unittest.skip("non-deterministic pipeline")
-    def test_inference_batch_single_identical(self):
-        return super().test_inference_batch_single_identical()
-
-
-@nightly
-@require_torch_gpu
-class StableDiffusionPix2PixZeroPipelineNightlyTests(unittest.TestCase):
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    @classmethod
-    def setUpClass(cls):
-        cls.source_embeds = load_pt(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat.pt"
-        )
-
-        cls.target_embeds = load_pt(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.pt"
-        )
-
-    def get_inputs(self, seed=0):
-        generator = torch.manual_seed(seed)
-
-        inputs = {
-            "prompt": "turn him into a cyborg",
-            "generator": generator,
-            "num_inference_steps": 3,
-            "guidance_scale": 7.5,
-            "cross_attention_guidance_amount": 0.15,
-            "source_embeds": self.source_embeds,
-            "target_embeds": self.target_embeds,
-            "output_type": "numpy",
-        }
-        return inputs
-
-    def test_stable_diffusion_pix2pix_zero_default(self):
-        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
-        )
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        inputs = self.get_inputs()
-        image = pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.5742, 0.5757, 0.5747, 0.5781, 0.5688, 0.5713, 0.5742, 0.5664, 0.5747])
-
-        assert np.abs(expected_slice - image_slice).max() < 5e-2
-
-    def test_stable_diffusion_pix2pix_zero_k_lms(self):
-        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
-        )
-        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        inputs = self.get_inputs()
-        image = pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1].flatten()
-
-        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.6367, 0.5459, 0.5146, 0.5479, 0.4905, 0.4753, 0.4961, 0.4629, 0.4624])
-
-        assert np.abs(expected_slice - image_slice).max() < 5e-2
-
-    def test_stable_diffusion_pix2pix_zero_intermediate_state(self):
-        number_of_steps = 0
-
-        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
-            callback_fn.has_been_called = True
-            nonlocal number_of_steps
-            number_of_steps += 1
-            if step == 1:
-                latents = latents.detach().cpu().numpy()
-                assert latents.shape == (1, 4, 64, 64)
-                latents_slice = latents[0, -3:, -3:, -1]
-                expected_slice = np.array([0.1345, 0.268, 0.1539, 0.0726, 0.0959, 0.2261, -0.2673, 0.0277, -0.2062])
-
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
-            elif step == 2:
-                latents = latents.detach().cpu().numpy()
-                assert latents.shape == (1, 4, 64, 64)
-                latents_slice = latents[0, -3:, -3:, -1]
-                expected_slice = np.array([0.1393, 0.2637, 0.1617, 0.0724, 0.0987, 0.2271, -0.2666, 0.0299, -0.2104])
-
-                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
-
-        callback_fn.has_been_called = False
-
-        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
-        )
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        inputs = self.get_inputs()
-        pipe(**inputs, callback=callback_fn, callback_steps=1)
-        assert callback_fn.has_been_called
-        assert number_of_steps == 3
-
-    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
-
-        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
-        )
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
-
-        inputs = self.get_inputs()
-        _ = pipe(**inputs)
-
-        mem_bytes = torch.cuda.max_memory_allocated()
-        # make sure that less than 8.2 GB is allocated
-        assert mem_bytes < 8.2 * 10**9
-
-
-@nightly
-@require_torch_gpu
-class InversionPipelineNightlyTests(unittest.TestCase):
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    @classmethod
-    def setUpClass(cls):
-        raw_image = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png"
-        )
-
-        raw_image = raw_image.convert("RGB").resize((512, 512))
-
-        cls.raw_image = raw_image
-
-    def test_stable_diffusion_pix2pix_inversion(self):
-        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
-        )
-        pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
-
-        caption = "a photography of a cat with flowers"
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.enable_model_cpu_offload()
-        pipe.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        output = pipe.invert(caption, image=self.raw_image, generator=generator, num_inference_steps=10)
-        inv_latents = output[0]
-
-        image_slice = inv_latents[0, -3:, -3:, -1].flatten()
-
-        assert inv_latents.shape == (1, 4, 64, 64)
-        expected_slice = np.array([0.8447, -0.0730, 0.7588, -1.2070, -0.4678, 0.1511, -0.8555, 1.1816, -0.7666])
-
-        assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 5e-2
-
-    def test_stable_diffusion_2_pix2pix_inversion(self):
-        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16
-        )
-        pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
-
-        caption = "a photography of a cat with flowers"
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.enable_model_cpu_offload()
-        pipe.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        output = pipe.invert(caption, image=self.raw_image, generator=generator, num_inference_steps=10)
-        inv_latents = output[0]
-
-        image_slice = inv_latents[0, -3:, -3:, -1].flatten()
-
-        assert inv_latents.shape == (1, 4, 64, 64)
-        expected_slice = np.array([0.8970, -0.1611, 0.4766, -1.1162, -0.5923, 0.1050, -0.9678, 1.0537, -0.6050])
-
-        assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 5e-2
-
-    def test_stable_diffusion_2_pix2pix_full(self):
-        # numpy array of https://huggingface.co/datasets/hf-internal-testing/diffusers-images/blob/main/pix2pix/dog_2.png
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog_2.npy"
-        )
-
-        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16
-        )
-        pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
-
-        caption = "a photography of a cat with flowers"
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.enable_model_cpu_offload()
-        pipe.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        output = pipe.invert(caption, image=self.raw_image, generator=generator)
-        inv_latents = output[0]
-
-        source_prompts = 4 * ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"]
-        target_prompts = 4 * ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"]
-
-        source_embeds = pipe.get_embeds(source_prompts)
-        target_embeds = pipe.get_embeds(target_prompts)
-
-        image = pipe(
-            caption,
-            source_embeds=source_embeds,
-            target_embeds=target_embeds,
-            num_inference_steps=125,
-            cross_attention_guidance_amount=0.015,
-            generator=generator,
-            latents=inv_latents,
-            negative_prompt=caption,
-            output_type="np",
-        ).images
-
-        mean_diff = np.abs(expected_image - image).mean()
-        assert mean_diff < 0.25
diff --git a/tests/pipelines/stable_diffusion_adapter/__init__.py b/tests/pipelines/stable_diffusion_adapter/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py b/tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py
similarity index 100%
rename from tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py
rename to tests/pipelines/stable_diffusion_adapter/test_stable_diffusion_adapter.py
diff --git a/tests/pipelines/stable_diffusion_gligen/__init__.py b/tests/pipelines/stable_diffusion_gligen/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_gligen.py b/tests/pipelines/stable_diffusion_gligen/test_stable_diffusion_gligen.py
similarity index 100%
rename from tests/pipelines/stable_diffusion/test_stable_diffusion_gligen.py
rename to tests/pipelines/stable_diffusion_gligen/test_stable_diffusion_gligen.py
diff --git a/tests/pipelines/stable_diffusion_gligen_text_image/__init__.py b/tests/pipelines/stable_diffusion_gligen_text_image/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_gligen_text_image.py b/tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py
similarity index 100%
rename from tests/pipelines/stable_diffusion/test_stable_diffusion_gligen_text_image.py
rename to tests/pipelines/stable_diffusion_gligen_text_image/test_stable_diffusion_gligen_text_image.py
diff --git a/tests/pipelines/stable_diffusion_image_variation/__init__.py b/tests/pipelines/stable_diffusion_image_variation/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py
similarity index 100%
rename from tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
rename to tests/pipelines/stable_diffusion_image_variation/test_stable_diffusion_image_variation.py
diff --git a/tests/pipelines/stable_diffusion_k_diffusion/__init__.py b/tests/pipelines/stable_diffusion_k_diffusion/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py b/tests/pipelines/stable_diffusion_k_diffusion/test_stable_diffusion_k_diffusion.py
similarity index 100%
rename from tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py
rename to tests/pipelines/stable_diffusion_k_diffusion/test_stable_diffusion_k_diffusion.py
diff --git a/tests/pipelines/stable_diffusion_ldm3d/__init__.py b/tests/pipelines/stable_diffusion_ldm3d/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py b/tests/pipelines/stable_diffusion_ldm3d/test_stable_diffusion_ldm3d.py
similarity index 100%
rename from tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py
rename to tests/pipelines/stable_diffusion_ldm3d/test_stable_diffusion_ldm3d.py
diff --git a/tests/pipelines/stable_diffusion_panorama/__init__.py b/tests/pipelines/stable_diffusion_panorama/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py
similarity index 100%
rename from tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
rename to tests/pipelines/stable_diffusion_panorama/test_stable_diffusion_panorama.py
diff --git a/tests/pipelines/stable_diffusion_sag/__init__.py b/tests/pipelines/stable_diffusion_sag/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/tests/pipelines/stable_diffusion_sag/test_stable_diffusion_sag.py
similarity index 100%
rename from tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
rename to tests/pipelines/stable_diffusion_sag/test_stable_diffusion_sag.py
diff --git a/utils/fetch_torch_cuda_pipeline_test_matrix.py b/utils/fetch_torch_cuda_pipeline_test_matrix.py
index ed34839f51..744201cb2d 100644
--- a/utils/fetch_torch_cuda_pipeline_test_matrix.py
+++ b/utils/fetch_torch_cuda_pipeline_test_matrix.py
@@ -15,6 +15,7 @@ ALWAYS_TEST_PIPELINE_MODULES = [
     "stable_diffusion",
     "stable_diffusion_2",
     "stable_diffusion_xl",
+    "stable_diffusion_adapter",
     "deepfloyd_if",
     "ip_adapters",
     "kandinsky",