diff --git a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py
index d3c8a3539b..18a0e970c6 100644
--- a/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py
+++ b/src/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py
@@ -344,7 +344,7 @@ class HunyuanVideoImageToVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoader
         )
         prompt_embeds = self.text_encoder(
             **expanded_inputs,
-            pixel_value=image_embeds,
+            pixel_values=image_embeds,
             output_hidden_states=True,
         ).hidden_states[-(num_hidden_layers_to_skip + 1)]
         prompt_embeds = prompt_embeds.to(dtype=dtype)
diff --git a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
index 5802bde87a..16b196929b 100644
--- a/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
+++ b/tests/pipelines/hunyuan_video/test_hunyuan_image2video.py
@@ -23,10 +23,13 @@ from transformers import (
     CLIPTextConfig,
     CLIPTextModel,
     CLIPTokenizer,
     LlamaConfig,
     LlamaModel,
     LlamaTokenizer,
+    LlavaConfig,
+    LlavaForConditionalGeneration,
 )
+from transformers.models.clip import CLIPVisionConfig
 
 from diffusers import (
     AutoencoderKLHunyuanVideo,
@@ -116,7 +119,7 @@ class HunyuanVideoImageToVideoPipelineFastTests(
         torch.manual_seed(0)
         scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
 
-        llama_text_encoder_config = LlamaConfig(
+        text_config = LlamaConfig(
             bos_token_id=0,
             eos_token_id=2,
             hidden_size=16,
@@ -129,6 +132,18 @@ class HunyuanVideoImageToVideoPipelineFastTests(
             hidden_act="gelu",
             projection_dim=32,
         )
+        vision_config = CLIPVisionConfig(
+            hidden_size=8,
+            intermediate_size=37,
+            projection_dim=32,
+            num_attention_heads=4,
+            num_hidden_layers=2,
+            image_size=224,
+        )
+        llava_text_encoder_config = LlavaConfig(
+            vision_config, text_config, image_seq_length=7, pad_token_id=1, image_token_index=8
+        )
+
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
@@ -144,7 +159,7 @@ class HunyuanVideoImageToVideoPipelineFastTests(
         )
 
         torch.manual_seed(0)
-        text_encoder = LlamaModel(llama_text_encoder_config)
+        text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
         tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
 
         torch.manual_seed(0)
@@ -153,14 +168,14 @@ class HunyuanVideoImageToVideoPipelineFastTests(
 
         torch.manual_seed(0)
         image_processor = CLIPImageProcessor(
-            crop_size=336,
+            crop_size=224,
             do_center_crop=True,
             do_normalize=True,
             do_resize=True,
             image_mean=[0.48145466, 0.4578275, 0.40821073],
             image_std=[0.26862954, 0.26130258, 0.27577711],
             resample=3,
-            size=336,
+            size=224,
         )
 
         components = {
@@ -188,8 +203,21 @@ class HunyuanVideoImageToVideoPipelineFastTests(
             "image": image,
             "prompt": "dance monkey",
             "prompt_template": {
-                "template": "{}",
- "crop_start": 0, + "template": ( + "<|start_header_id|>system<|end_header_id|>\n\n\nDescribe the video by detailing the following aspects according to the reference image: " + "1. The main content and theme of the video." + "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects." + "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects." + "4. background environment, light, style and atmosphere." + "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n" + "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\n" + ), + "crop_start": 5, + "image_emb_len": 49, + "image_emb_start": 5, + "image_emb_end": 54, + "double_return_token_id": 10, }, "generator": generator, "num_inference_steps": 2, @@ -197,7 +225,7 @@ class HunyuanVideoImageToVideoPipelineFastTests( "height": image_height, "width": image_width, "num_frames": 9, - "max_sequence_length": 16, + "max_sequence_length": 64, "output_type": "pt", } return inputs