diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_to_text.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_to_text.py
index 129134a479..40c432f44f 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_to_text.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_to_text.py
@@ -219,6 +219,7 @@ class VersatileDiffusionImageToTextPipeline(DiffusionPipeline):
         batch_size = len(prompt) if isinstance(prompt, list) else 1
 
         # get prompt text embeddings
+        # prompt = [(np.asarray(prompt) / 255)]
         image_input = self.image_feature_extractor(images=prompt, return_tensors="pt")
         image_embeddings = self.image_encoder(image_input.pixel_values.to(self.device))
         image_embeddings = normalize_embeddings(image_embeddings)
@@ -232,7 +233,7 @@ class VersatileDiffusionImageToTextPipeline(DiffusionPipeline):
         if do_classifier_free_guidance:
             uncond_images: List[str]
             if negative_prompt is None:
-                uncond_images = [np.zeros((512, 512, 3))] * batch_size
+                uncond_images = [np.zeros((512, 512, 3)) + 0.5] * batch_size
             elif type(prompt) is not type(negative_prompt):
                 raise TypeError(
                     f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
@@ -430,16 +431,22 @@ class VersatileDiffusionImageToTextPipeline(DiffusionPipeline):
             latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
             latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
+            print("latent_model_input", latent_model_input.abs().sum())
+            print("timestep", t)
+
             # predict the noise residual
             noise_pred = self.text_unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample
 
-            # perform guidance
             if do_classifier_free_guidance:
                 noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                 noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
+            print("e_t", noise_pred.abs().sum())
+            print("e_t[3,3]", noise_pred[0, :5, 0, 0])
+
             # compute the previous noisy sample x_t -> x_t-1
             latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+            print("latents", latents.abs().sum())
 
             # call the callback, if provided
             if callback is not None and i % callback_steps == 0:
diff --git a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_to_text.py b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_to_text.py
index dbaaeeb262..f03535692e 100644
--- a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_to_text.py
+++ b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_to_text.py
@@ -18,7 +18,7 @@ import unittest
 import numpy as np
 import torch
 
-from diffusers import VersatileDiffusionImageToTextPipeline
+from diffusers import VersatileDiffusionImageToTextPipeline, DDIMScheduler
 from diffusers.utils.testing_utils import load_image, require_torch_gpu, slow, torch_device
 
 from ...test_pipelines_common import PipelineTesterMixin
@@ -42,10 +42,13 @@ class VersatileDiffusionImageToTextPipelineIntegrationTests(unittest.TestCase):
         image_prompt = load_image(
             "https://raw.githubusercontent.com/SHI-Labs/Versatile-Diffusion/master/assets/boy_and_girl.jpg"
         )
-        generator = torch.Generator(device=torch_device).manual_seed(0)
+        # generator = torch.Generator(device=torch_device).manual_seed(0)
+        np.random.seed(8)
+        torch.manual_seed(108)
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
         text = pipe(
             image=image_prompt,
-            generator=generator,
+            # generator=generator,
             guidance_scale=7.5,
             num_inference_steps=50,
             output_type="str",
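The `.abs().sum()` prints added above follow a common parity-check pattern for porting a pipeline: fingerprint the same intermediate tensors at each denoising step and diff the printed trace against one emitted by the reference implementation. A minimal sketch of that pattern as a reusable helper, assuming only PyTorch (`debug_checksum` is a hypothetical name, not part of diffusers):

```python
import torch


def debug_checksum(name: str, tensor: torch.Tensor) -> None:
    # Sum of absolute values is a cheap, order-insensitive fingerprint
    # that surfaces most numerical drift between two implementations.
    print(f"{name}: {tensor.abs().sum().item():.6f}")


# Example: inside the denoising loop, call
#   debug_checksum("latents", latents)
# and compare the printed trace, step by step, with the same checksums
# printed by the reference implementation.
```

When two traces diverge, the first mismatching line localizes the offending op, which is why the diff prints after `scale_model_input`, after the UNet forward pass, and after the scheduler step.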