diff --git a/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5.py b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5.py
index 378f557023..3464853add 100644
--- a/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5.py
+++ b/src/diffusers/pipelines/hunyuan_video1_5/pipeline_hunyuan_video1_5.py
@@ -227,13 +227,14 @@ class HunyuanVideo15Pipeline(DiffusionPipeline):
         self.video_processor = HunyuanVideo15ImageProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
         self.target_size = self.transformer.config.target_size if getattr(self, "transformer", None) else 640
         self.vision_states_dim = self.transformer.config.image_embed_dim if getattr(self, "transformer", None) else 1152
+        self.num_channels_latents = self.vae.latent_channels if hasattr(self, "vae") else 32
         # fmt: off
-        self.system_message ="You are a helpful assistant. Describe the video by detailing the following aspects: \
-        1. The main content and theme of the video. \
-        2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects. \
-        3. Actions, events, behaviors temporal relationships, physical movement changes of the objects. \
-        4. background environment, light, style and atmosphere. \
-        5. camera angles, movements, and transitions used in the video."
+        self.system_message = "You are a helpful assistant. Describe the video by detailing the following aspects: \
+            1. The main content and theme of the video. \
+            2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects. \
+            3. Actions, events, behaviors temporal relationships, physical movement changes of the objects. \
+            4. background environment, light, style and atmosphere. \
+            5. camera angles, movements, and transitions used in the video."
         # fmt: on
         self.prompt_template_encode_start_idx = 108
         self.tokenizer_max_length = 1000
@@ -253,11 +254,11 @@ class HunyuanVideo15Pipeline(DiffusionPipeline):
         num_hidden_layers_to_skip: int = 2,
         # fmt: off
         system_message: str = "You are a helpful assistant. Describe the video by detailing the following aspects: \
-        1. The main content and theme of the video. \
-        2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects. \
-        3. Actions, events, behaviors temporal relationships, physical movement changes of the objects. \
-        4. background environment, light, style and atmosphere. \
-        5. camera angles, movements, and transitions used in the video.",
+            1. The main content and theme of the video. \
+            2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects. \
+            3. Actions, events, behaviors temporal relationships, physical movement changes of the objects. \
+            4. background environment, light, style and atmosphere. \
+            5. camera angles, movements, and transitions used in the video.",
         # fmt: on
         crop_start: int = 108,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -286,12 +287,13 @@ class HunyuanVideo15Pipeline(DiffusionPipeline):
             attention_mask=prompt_attention_mask,
             output_hidden_states=True,
         ).hidden_states[-(num_hidden_layers_to_skip + 1)]
-        prompt_embeds = prompt_embeds.to(dtype=dtype)

         if crop_start is not None and crop_start > 0:
             prompt_embeds = prompt_embeds[:, crop_start:]
             prompt_attention_mask = prompt_attention_mask[:, crop_start:]

+        prompt_embeds = prompt_embeds.to(dtype=dtype)
+
         return prompt_embeds, prompt_attention_mask

@@ -578,7 +580,7 @@ class HunyuanVideo15Pipeline(DiffusionPipeline):
         negative_prompt: Union[str, List[str]] = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
-        num_frames: int = 129,
+        num_frames: int = 121,
         num_inference_steps: int = 50,
         sigmas: List[float] = None,
         num_videos_per_prompt: Optional[int] = 1,
@@ -752,10 +754,9 @@ class HunyuanVideo15Pipeline(DiffusionPipeline):
         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, sigmas=sigmas)

         # 5. Prepare latent variables
-        num_channels_latents = self.transformer.config.in_channels
         latents = self.prepare_latents(
             batch_size * num_videos_per_prompt,
-            num_channels_latents,
+            self.num_channels_latents,
             height,
             width,
             num_frames,
@@ -877,7 +878,7 @@ class HunyuanVideo15Pipeline(DiffusionPipeline):
         if not output_type == "latent":
             latents = latents.to(self.vae.dtype) / self.vae.config.scaling_factor
-            video = self.vae.decode(latents, return_dict=False, generator=generator)[0]
+            video = self.vae.decode(latents, return_dict=False)[0]
             video = self.video_processor.postprocess_video(video, output_type=output_type)
         else:
             video = latents
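
For reference, a minimal usage sketch of the patched pipeline follows. It assumes `HunyuanVideo15Pipeline` is exported from the top-level `diffusers` namespace and that the pipeline output exposes `.frames`, as other diffusers video pipelines do; the checkpoint id is a placeholder, not a real repository name. The explicit `num_frames=121` simply mirrors the new default introduced by this patch (previously 129).

```python
import torch
from diffusers import HunyuanVideo15Pipeline  # assumes top-level export
from diffusers.utils import export_to_video

# Placeholder repo id -- substitute the HunyuanVideo 1.5 checkpoint you actually use.
pipe = HunyuanVideo15Pipeline.from_pretrained(
    "<hunyuan-video-1.5-checkpoint>",
    torch_dtype=torch.bfloat16,
)
pipe.to("cuda")

# num_frames=121 matches the new default from this patch; omit it to get the same value.
output = pipe(
    prompt="A corgi surfing a small wave at sunset",
    num_frames=121,
    num_inference_steps=50,
)
export_to_video(output.frames[0], "output.mp4", fps=24)
```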