1
0
mirror of https://github.com/huggingface/diffusers.git synced 2026-01-27 17:22:53 +03:00

correct unconditional image

This commit is contained in:
Patrick von Platen
2022-11-23 12:08:46 +00:00
parent 22c6b32672
commit 8d4207ddd2
2 changed files with 15 additions and 5 deletions

View File

@@ -219,6 +219,7 @@ class VersatileDiffusionImageToTextPipeline(DiffusionPipeline):
batch_size = len(prompt) if isinstance(prompt, list) else 1
# get prompt text embeddings
# prompt = [(np.asarray(prompt) / 255)]
image_input = self.image_feature_extractor(images=prompt, return_tensors="pt")
image_embeddings = self.image_encoder(image_input.pixel_values.to(self.device))
image_embeddings = normalize_embeddings(image_embeddings)
@@ -232,7 +233,7 @@ class VersatileDiffusionImageToTextPipeline(DiffusionPipeline):
if do_classifier_free_guidance:
uncond_images: List[str]
if negative_prompt is None:
uncond_images = [np.zeros((512, 512, 3))] * batch_size
uncond_images = [np.zeros((512, 512, 3)) + 0.5] * batch_size
elif type(prompt) is not type(negative_prompt):
raise TypeError(
f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
@@ -430,16 +431,22 @@ class VersatileDiffusionImageToTextPipeline(DiffusionPipeline):
latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
print("latent_model_input", latent_model_input.abs().sum())
print("timestep", t)
# predict the noise residual
noise_pred = self.text_unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample
# perform guidance
if do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print("e_t", noise_pred.abs().sum())
print("e_t[3,3]", noise_pred[0, :5, 0, 0])
# compute the previous noisy sample x_t -> x_t-1
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
print("latents", latents.abs().sum())
# call the callback, if provided
if callback is not None and i % callback_steps == 0:

View File

@@ -18,7 +18,7 @@ import unittest
import numpy as np
import torch
from diffusers import VersatileDiffusionImageToTextPipeline
from diffusers import VersatileDiffusionImageToTextPipeline, DDIMScheduler
from diffusers.utils.testing_utils import load_image, require_torch_gpu, slow, torch_device
from ...test_pipelines_common import PipelineTesterMixin
@@ -42,10 +42,13 @@ class VersatileDiffusionImageToTextPipelineIntegrationTests(unittest.TestCase):
image_prompt = load_image(
"https://raw.githubusercontent.com/SHI-Labs/Versatile-Diffusion/master/assets/boy_and_girl.jpg"
)
generator = torch.Generator(device=torch_device).manual_seed(0)
# generator = torch.Generator(device=torch_device).manual_seed(0)
np.random.seed(8)
torch.manual_seed(108)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
text = pipe(
image=image_prompt,
generator=generator,
# generator=generator,
guidance_scale=7.5,
num_inference_steps=50,
output_type="str",