From 54f008e30b57306b6e22477769f2635c21b12a7b Mon Sep 17 00:00:00 2001
From: YiYi Xu
Date: Sun, 30 Nov 2025 13:17:31 -1000
Subject: [PATCH 1/4] Update docs/source/en/api/pipelines/hunyuan_video15.md

---
 docs/source/en/api/pipelines/hunyuan_video15.md | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/docs/source/en/api/pipelines/hunyuan_video15.md b/docs/source/en/api/pipelines/hunyuan_video15.md
index 2d74088355..f7c7147f80 100644
--- a/docs/source/en/api/pipelines/hunyuan_video15.md
+++ b/docs/source/en/api/pipelines/hunyuan_video15.md
@@ -12,13 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 -->
-
-<div style="float: right;">
-  <div class="flex flex-wrap space-x-1">
-    <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-  </div>
-</div>
-
 # HunyuanVideo-1.5

From 237d318e85136c36dfc9551f6681f636f79c7c30 Mon Sep 17 00:00:00 2001
From: YiYi Xu
Date: Sun, 30 Nov 2025 13:22:43 -1000
Subject: [PATCH 2/4] Apply suggestions from code review

Co-authored-by: Sayak Paul
---
 .../models/autoencoders/autoencoder_kl_hunyuanvideo15.py | 5 +----
 .../models/transformers/transformer_hunyuan_video15.py   | 7 ++-----
 .../pipelines/hunyuan_video1_5/image_processor.py        | 2 +-
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.py
index 7d6a636a24..4b1beb74a3 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanvideo15.py
@@ -215,8 +215,6 @@ class HunyuanVideo15Downsample(nn.Module):
     def __init__(self, in_channels: int, out_channels: int, add_temporal_downsample: bool = True):
         super().__init__()
         factor = 2 * 2 * 2 if add_temporal_downsample else 1 * 2 * 2
-        assert out_channels % factor == 0
-        # self.conv = Conv3d(in_channels, out_channels // factor, kernel_size=3, stride=1, padding=1)
         self.conv = HunyuanVideo15CausalConv3d(in_channels, out_channels // factor, kernel_size=3)

         self.add_temporal_downsample = add_temporal_downsample
@@ -531,7 +529,6 @@ class HunyuanVideo15Encoder3D(nn.Module):

         hidden_states = self.mid_block(hidden_states)

-        # short_cut = rearrange(hidden_states, "b (c r) f h w -> b c r f h w", r=self.group_size).mean(dim=2)
         batch_size, _, frame, height, width = hidden_states.shape
         short_cut = hidden_states.view(batch_size, -1, self.group_size, frame, height, width).mean(dim=2)

@@ -546,7 +543,7 @@ class HunyuanVideo15Encoder3D(nn.Module):

 class HunyuanVideo15Decoder3D(nn.Module):
     r"""
-    Causal decoder for 3D video-like data used for HunyuanImage-2.1 Refiner.
+    Causal decoder for 3D video-like data used for HunyuanVideo-1.5.
     """

     def __init__(
diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video15.py b/src/diffusers/models/transformers/transformer_hunyuan_video15.py
index 8f191e7500..b870b15dad 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video15.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video15.py
@@ -184,10 +184,7 @@ class HunyuanVideo15TimeEmbedding(nn.Module):
             The dimension of the output embedding.
     """

-    def __init__(
-        self,
-        embedding_dim: int,
-    ):
+    def __init__(self, embedding_dim: int):
         super().__init__()

         self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
@@ -362,7 +359,7 @@ class HunyuanVideo15RotaryPosEmbed(nn.Module):
         rope_sizes = [num_frames // self.patch_size_t, height // self.patch_size, width // self.patch_size]

         axes_grids = []
-        for i in range(3):
+        for i in range(len(rope_sizes)):
             # Note: The following line diverges from original behaviour. We create the grid on the device, whereas
             # original implementation creates it on CPU and then moves it to device. This results in numerical
             # differences in layerwise debugging outputs, but visually it is the same.
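The behaviour note in the RoPE hunk above is easy to miss. Below is a minimal sketch of the two grid-construction paths it contrasts; `rope_sizes`, `dim`, and `theta` are illustrative stand-ins, not the module's actual attributes:

```py
import torch

# Illustrative values; the real module derives these from patch sizes and input shape.
rope_sizes = [8, 30, 40]  # (frames, height, width) axis lengths
dim, theta = 16, 10000.0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

freqs = 1.0 / theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
pos = torch.arange(rope_sizes[0], dtype=torch.float32)

# diffusers path: the grid lives on the target device, so cos/sin also run there.
angles_on_device = torch.outer(pos.to(device), freqs.to(device))
cos_on_device = angles_on_device.cos()

# original path: grid and trig computed on CPU, result moved to the device afterwards.
angles_on_cpu = torch.outer(pos, freqs)
cos_via_cpu = angles_on_cpu.cos().to(device)

# CPU and GPU trig kernels can differ in the last few ULPs, which is why layerwise
# debugging outputs diverge slightly even though the decoded videos look identical.
print((cos_on_device - cos_via_cpu).abs().max())
```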
diff --git a/src/diffusers/pipelines/hunyuan_video1_5/image_processor.py b/src/diffusers/pipelines/hunyuan_video1_5/image_processor.py
index b0c7782352..82817365b6 100644
--- a/src/diffusers/pipelines/hunyuan_video1_5/image_processor.py
+++ b/src/diffusers/pipelines/hunyuan_video1_5/image_processor.py
@@ -34,7 +34,7 @@ def generate_crop_size_list(base_size=256, patch_size=16, max_ratio=4.0):
     return crop_size_list

-# copied fromhttps://github.com/Tencent-Hunyuan/HunyuanVideo-1.5/blob/main/hyvideo/utils/data_utils.py#L38
+# copied from https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5/blob/main/hyvideo/utils/data_utils.py#L38
 def get_closest_ratio(height: float, width: float, ratios: list, buckets: list):
     """
     Get the closest ratio in the buckets.

From d7f399d1b261021effacb7587aa455da7a1fefe5 Mon Sep 17 00:00:00 2001
From: "yiyi@huggingface.co"
Date: Mon, 1 Dec 2025 00:33:00 +0000
Subject: [PATCH 3/4] add a note to the docs about attention backends

---
 docs/source/en/api/pipelines/hunyuan_video15.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/docs/source/en/api/pipelines/hunyuan_video15.md b/docs/source/en/api/pipelines/hunyuan_video15.md
index f7c7147f80..531741d924 100644
--- a/docs/source/en/api/pipelines/hunyuan_video15.md
+++ b/docs/source/en/api/pipelines/hunyuan_video15.md
@@ -52,6 +52,18 @@ video = pipeline(prompt=prompt, num_frames=61, num_inference_steps=30).frames[0]
 export_to_video(video, "output.mp4", fps=15)
 ```

+## Notes
+
+- HunyuanVideo-1.5 uses attention masks with variable-length sequences. For best performance, we recommend using an attention backend that handles padding efficiently.
+
+  - **H100/H800:** `_flash_3_hub` or `_flash_varlen_3`
+  - **A100/A800/RTX 4090:** `flash` or `flash_varlen`
+  - **Other GPUs:** `sage`
+
+```py
+pipeline.transformer.set_attention_backend("flash_varlen") # or your preferred backend
+```
+
 ## HunyuanVideo15Pipeline

From bdfab30766bf0bddf3aa5dd3487e18bfc263b78d Mon Sep 17 00:00:00 2001
From: "yiyi@huggingface.co"
Date: Mon, 1 Dec 2025 00:35:30 +0000
Subject: [PATCH 4/4] up

---
 docs/source/en/api/pipelines/hunyuan_video15.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/source/en/api/pipelines/hunyuan_video15.md b/docs/source/en/api/pipelines/hunyuan_video15.md
index 531741d924..9a9bdcb352 100644
--- a/docs/source/en/api/pipelines/hunyuan_video15.md
+++ b/docs/source/en/api/pipelines/hunyuan_video15.md
@@ -60,6 +60,9 @@ export_to_video(video, "output.mp4", fps=15)
   - **A100/A800/RTX 4090:** `flash` or `flash_varlen`
   - **Other GPUs:** `sage`

+Refer to the [Attention backends](../../optimization/attention_backends) guide for more details about using a different backend.
+
+
 ```py
 pipeline.transformer.set_attention_backend("flash_varlen") # or your preferred backend
 ```
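To see how the documentation added in patches 3/4 fits together, here is a hedged end-to-end sketch. The checkpoint id, dtype, and prompt are assumptions for illustration (the patches do not pin them); `HunyuanVideo15Pipeline`, `export_to_video`, and `set_attention_backend` come from the doc itself:

```py
import torch
from diffusers import HunyuanVideo15Pipeline
from diffusers.utils import export_to_video

# Hypothetical checkpoint id and dtype; consult the model card for the real values.
pipeline = HunyuanVideo15Pipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo-1.5", torch_dtype=torch.bfloat16
).to("cuda")

# Choose a backend per the GPU table in the Notes section:
# H100/H800 -> "_flash_3_hub" or "_flash_varlen_3";
# A100/A800/RTX 4090 -> "flash" or "flash_varlen"; other GPUs -> "sage".
pipeline.transformer.set_attention_backend("flash_varlen")

prompt = "A cat walks on the grass, realistic style."  # illustrative prompt
video = pipeline(prompt=prompt, num_frames=61, num_inference_steps=30).frames[0]
export_to_video(video, "output.mp4", fps=15)
```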