mirror of https://github.com/huggingface/diffusers.git

commit 42c27dbfc0
parent 7212f35de2
Author: Dhruv Nair
Date:   2025-04-16 10:26:26 +02:00

3 changed files with 37 additions and 8 deletions


@@ -446,6 +446,7 @@ class HunyuanVideoTokenRefiner(nn.Module):
         else:
             original_dtype = hidden_states.dtype
             mask_float = attention_mask.float().unsqueeze(-1)
+            __import__("ipdb").set_trace()
             pooled_projections = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)
             pooled_projections = pooled_projections.to(original_dtype)
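The one line added here, __import__("ipdb").set_trace(), opens an interactive ipdb breakpoint just before the pooling step. The surrounding code is a masked mean pool: padded positions are zeroed out and the sum is divided by the number of valid tokens per sequence. A minimal standalone sketch of that pooling, with illustrative shapes not taken from the source:

import torch

# Batch of 2 sequences, length 4, hidden size 8 (illustrative shapes).
hidden_states = torch.randn(2, 4, 8)
attention_mask = torch.tensor([[1, 1, 0, 0], [1, 1, 1, 1]])

# Zero out padded positions, then average only over the valid tokens.
mask_float = attention_mask.float().unsqueeze(-1)  # (2, 4, 1)
pooled = (hidden_states * mask_float).sum(dim=1) / mask_float.sum(dim=1)  # (2, 8)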


@@ -344,7 +344,7 @@ class HunyuanVideoImageToVideoPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
         )
         prompt_embeds = self.text_encoder(
             **expanded_inputs,
-            pixel_value=image_embeds,
+            pixel_values=image_embeds,
             output_hidden_states=True,
         ).hidden_states[-(num_hidden_layers_to_skip + 1)]
         prompt_embeds = prompt_embeds.to(dtype=dtype)
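The fix renames the keyword pixel_value to pixel_values, the argument name the transformers text encoder (a LlavaForConditionalGeneration in this pipeline) actually exposes for image tensors. Depending on the transformers version, the misspelled singular form either raises a TypeError or, if the forward signature carries a catch-all **kwargs, is silently swallowed so the reference image never reaches the vision tower. A runnable stand-in (not the real model) showing that silent failure mode:

import torch

def forward(input_ids, pixel_values=None, **kwargs):
    # Stand-in for a model forward with a catch-all: a misspelled keyword
    # lands in **kwargs and the image tensor is silently ignored.
    return pixel_values

ids = torch.tensor([[1, 2, 3]])
img = torch.randn(1, 3, 224, 224)
assert forward(ids, pixel_value=img) is None       # typo: image silently dropped
assert forward(ids, pixel_values=img) is not None  # fixed: image reaches the model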


@@ -23,10 +23,13 @@ from transformers import (
     CLIPTextConfig,
     CLIPTextModel,
     CLIPTokenizer,
+    LlavaForConditionalGeneration,
+    LlavaConfig,
     LlamaConfig,
     LlamaModel,
     LlamaTokenizer,
 )
+from transformers.models.clip import CLIPVisionConfig
 from diffusers import (
     AutoencoderKLHunyuanVideo,
@@ -116,7 +119,7 @@ class HunyuanVideoImageToVideoPipelineFastTests(
         torch.manual_seed(0)
         scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
-        llama_text_encoder_config = LlamaConfig(
+        text_config = LlamaConfig(
             bos_token_id=0,
             eos_token_id=2,
             hidden_size=16,
@@ -129,6 +132,18 @@ class HunyuanVideoImageToVideoPipelineFastTests(
             hidden_act="gelu",
             projection_dim=32,
         )
+        vision_config = CLIPVisionConfig(
+            hidden_size=8,
+            intermediate_size=37,
+            projection_dim=32,
+            num_attention_heads=4,
+            num_hidden_layers=2,
+            image_size=224,
+        )
+        llava_text_encoder_config = LlavaConfig(
+            vision_config, text_config, image_seq_length=7, pad_token_id=1, image_token_index=8
+        )
         clip_text_encoder_config = CLIPTextConfig(
             bos_token_id=0,
             eos_token_id=2,
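The dummy vision tower is built with image_size=224 and leaves patch_size at its CLIPVisionConfig default of 32, which yields (224 / 32)^2 = 49 patch tokens per image; that is presumably where the image_emb_len of 49 in the test inputs below comes from. A quick check under that default-patch-size assumption:

from transformers import CLIPVisionConfig

cfg = CLIPVisionConfig(image_size=224)  # patch_size defaults to 32
num_patches = (cfg.image_size // cfg.patch_size) ** 2
print(num_patches)  # 49, matching image_emb_len in get_dummy_inputs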
@@ -144,7 +159,7 @@ class HunyuanVideoImageToVideoPipelineFastTests(
         )
         torch.manual_seed(0)
-        text_encoder = LlamaModel(llama_text_encoder_config)
+        text_encoder = LlavaForConditionalGeneration(llava_text_encoder_config)
         tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
         torch.manual_seed(0)
@@ -153,14 +168,14 @@ class HunyuanVideoImageToVideoPipelineFastTests(
         torch.manual_seed(0)
         image_processor = CLIPImageProcessor(
-            crop_size=336,
+            crop_size=224,
             do_center_crop=True,
             do_normalize=True,
             do_resize=True,
             image_mean=[0.48145466, 0.4578275, 0.40821073],
             image_std=[0.26862954, 0.26130258, 0.27577711],
             resample=3,
-            size=336,
+            size=224,
         )
         components = {
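Dropping the processor's size and crop_size from 336 to 224 aligns it with the image_size=224 of the dummy CLIPVisionConfig above; a mismatched crop would feed the vision tower a resolution whose patch count no longer matches the hard-coded image_emb_len of 49. A small sketch of the resulting output shape (the input image is a placeholder):

import numpy as np
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor(size=224, crop_size=224)
image = np.zeros((336, 336, 3), dtype=np.uint8)
pixel_values = processor(image, return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])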
@@ -188,8 +203,21 @@ class HunyuanVideoImageToVideoPipelineFastTests(
             "image": image,
             "prompt": "dance monkey",
             "prompt_template": {
-                "template": "{}",
-                "crop_start": 0,
+                "template": (
+                    "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
+                    "1. The main content and theme of the video."
+                    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
+                    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
+                    "4. background environment, light, style and atmosphere."
+                    "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
+                    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
+                    "<|start_header_id|>assistant<|end_header_id|>\n\n"
+                ),
+                "crop_start": 5,
+                "image_emb_len": 49,
+                "image_emb_start": 5,
+                "image_emb_end": 54,
+                "double_return_token_id": 10,
             },
             "generator": generator,
             "num_inference_steps": 2,
@@ -197,7 +225,7 @@ class HunyuanVideoImageToVideoPipelineFastTests(
             "height": image_height,
             "width": image_width,
             "num_frames": 9,
-            "max_sequence_length": 16,
+            "max_sequence_length": 64,
             "output_type": "pt",
         }
         return inputs
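max_sequence_length rises from 16 to 64, presumably because the prompt is no longer the bare "{}" template: the multi-line Llava chat scaffolding alone tokenizes to far more than 16 tokens, so the old budget would truncate it before the user prompt even appeared.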