diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index 5695429489..682b18d5cc 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -966,7 +966,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
                 encoder_attention_mask=encoder_attention_mask,
             )
             # To support T2I-Adapter-XL
-            if is_adapter and len(down_block_additional_residuals) > 0:
+            if is_adapter and len(down_block_additional_residuals) > 0 and sample.shape == down_block_additional_residuals[0].shape:
                 sample += down_block_additional_residuals.pop(0)
 
         if is_controlnet:
diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py
index fe088a80e2..1d5e102496 100644
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_adapter.py
@@ -46,34 +46,24 @@ class AdapterTests:
 
     def get_dummy_components(self, adapter_type):
         torch.manual_seed(0)
-        if adapter_type == 'light_adapter':
-            channels = [32, 32, 32]
-        else:
-            channels = [32, 32, 32, 32]
-        torch.manual_seed(0)
         unet = UNet2DConditionModel(
-            block_out_channels=[32, 32, 32, 32],
+            block_out_channels=(32, 64),
             layers_per_block=2,
             sample_size=32,
             in_channels=4,
             out_channels=4,
-            down_block_types=(
-                "CrossAttnDownBlock2D",
-                "CrossAttnDownBlock2D",
-                "CrossAttnDownBlock2D",
-                "DownBlock2D",
-            ),
-            up_block_types= ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
+            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
             cross_attention_dim=32,
         )
         scheduler = PNDMScheduler(skip_prk_steps=True)
         torch.manual_seed(0)
         vae = AutoencoderKL(
-            block_out_channels=[32, 32, 32, 32],
+            block_out_channels=[32, 64],
             in_channels=3,
             out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
+            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
             latent_channels=4,
         )
         torch.manual_seed(0)
@@ -94,9 +84,9 @@ class AdapterTests:
         torch.manual_seed(0)
         adapter = T2IAdapter(
             in_channels=3,
-            channels=channels,
+            channels=[32, 64],
             num_res_blocks=2,
-            downscale_factor=8,
+            downscale_factor=2,
             adapter_type=adapter_type,
         )
 
@@ -155,8 +145,11 @@ class StableDiffusionFullAdapterPipelineFastTests(AdapterTests, PipelineTesterMixin, unittest.TestCase):
 
         inputs = self.get_dummy_inputs(device)
         image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4858, 0.5500, 0.4278, 0.4669, 0.6184, 0.4322, 0.5010, 0.5033, 0.4746])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
 
 
 class StableDiffusionLightAdapterPipelineFastTests(AdapterTests, PipelineTesterMixin, unittest.TestCase):
@@ -172,8 +165,11 @@ class StableDiffusionLightAdapterPipelineFastTests(AdapterTests, PipelineTesterMixin, unittest.TestCase):
 
         inputs = self.get_dummy_inputs(device)
         image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
 
         assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.4965, 0.5548, 0.4330, 0.4771, 0.6226, 0.4382, 0.5037, 0.5071, 0.4782])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
 
 
 @slow
@@ -317,4 +313,4 @@ class StableDiffusionAdapterPipelineSlowTests(unittest.TestCase):
         pipe(prompt="foo", image=image, num_inference_steps=2)
 
         mem_bytes = torch.cuda.max_memory_allocated()
-        assert mem_bytes < 5 * 10**9
+        assert mem_bytes < 5 * 10**9
\ No newline at end of file
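
For context on the first hunk: the shape guard means an adapter residual is consumed only when it matches the current down-block output, so a T2I-Adapter that emits fewer feature maps than the UNet has down blocks (the T2I-Adapter-XL case) simply skips the non-matching positions. Below is a minimal standalone sketch of that shape-gated addition; it is not part of the PR, and the helper name and tensor shapes are illustrative only.

```python
import torch

def add_adapter_residuals(down_block_samples, adapter_residuals):
    # Consume each adapter residual only when its shape matches the current
    # sample, mirroring the `sample.shape == down_block_additional_residuals[0].shape`
    # guard in the patch. Non-matching samples pass through unchanged.
    residuals = list(adapter_residuals)
    merged = []
    for sample in down_block_samples:
        if residuals and sample.shape == residuals[0].shape:
            sample = sample + residuals.pop(0)
        merged.append(sample)
    return merged

# Hypothetical shapes: three down-block outputs, but the adapter produced
# residuals for only the two lower resolutions, so the first output is skipped.
samples = [torch.randn(1, 32, 32, 32), torch.randn(1, 64, 16, 16), torch.randn(1, 64, 8, 8)]
adapter_residuals = [torch.randn(1, 64, 16, 16), torch.randn(1, 64, 8, 8)]

merged = add_adapter_residuals(samples, adapter_residuals)
assert torch.equal(merged[0], samples[0])      # shape mismatch: residual not applied
assert not torch.equal(merged[1], samples[1])  # shape match: residual applied
```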