diff --git a/src/diffusers/pipelines/unidiffuser/convert_from_ckpt.py b/src/diffusers/pipelines/unidiffuser/convert_from_ckpt.py index ac3a72da47..25fbb4b48e 100644 --- a/src/diffusers/pipelines/unidiffuser/convert_from_ckpt.py +++ b/src/diffusers/pipelines/unidiffuser/convert_from_ckpt.py @@ -613,13 +613,13 @@ def convert_caption_decoder_to_diffusers(ckpt, diffusers_model): def main(args): # Create corresponding models, hardcoded for now. vae_config = create_vae_diffusers_config(args) - AutoencoderKL(**vae_config) + vae = AutoencoderKL(**vae_config) unet_config = create_unidiffuser_unet_config(args) unet = UniDiffuserModel(**unet_config) text_decoder_config = create_text_decoder_config(args) - UniDiffuserTextDecoder(**text_decoder_config) + text_decoder = UniDiffuserTextDecoder(**text_decoder_config) print("Converting VAE checkpoint...") vae = convert_vae_to_diffusers(args.vae_ckpt, vae) diff --git a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py index 5ba6be5d6d..9e44e7d34b 100644 --- a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +++ b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -151,7 +151,7 @@ class UniDiffuserPipeline(DiffusionPipeline): self.text_encoder_hidden_size = text_encoder.config.hidden_size self.image_encoder_hidden_size = image_encoder.config.hidden_size - self.text_intermediate_dim = 0 + self.text_intermediate_dim = self.text_encoder_hidden_size if self.text_decoder.prefix_hidden_dim is not None: self.text_intermediate_dim = self.text_decoder.prefix_hidden_dim @@ -934,7 +934,7 @@ class UniDiffuserPipeline(DiffusionPipeline): mode = self._infer_mode(prompt, prompt_embeds, image, prompt_latents, vae_latents, clip_latents) batch_size = self._infer_batch_size(mode, prompt, prompt_embeds, image, num_samples) device = self._execution_device - reduce_text_emb_dim = self.text_intermediate_dim < self.text_encoder_hidden_size or self.mode != "t2i" + 
reduce_text_emb_dim = self.text_intermediate_dim < self.text_encoder_hidden_size or mode != "text2img" # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`