From 474a248f10a9ec3d02fb9bd5f220d5bec158bfd3 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Tue, 24 Jun 2025 13:49:37 +0530
Subject: [PATCH] [tests] Fix HunyuanVideo Framepack device tests (#11789)

update
---
 .../test_hunyuan_video_framepack.py          | 25 ++++++++++++++++++-
 .../pipelines/hunyuandit/test_hunyuan_dit.py | 12 +++++++--
 tests/pipelines/test_pipelines_common.py     |  7 +++---
 3 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/tests/pipelines/hunyuan_video/test_hunyuan_video_framepack.py b/tests/pipelines/hunyuan_video/test_hunyuan_video_framepack.py
index f4408e7cd5..9f685d34c9 100644
--- a/tests/pipelines/hunyuan_video/test_hunyuan_video_framepack.py
+++ b/tests/pipelines/hunyuan_video/test_hunyuan_video_framepack.py
@@ -71,7 +71,6 @@ class HunyuanVideoFramepackPipelineFastTests(
     )
 
     supports_dduf = False
-    # there is no xformers processor for Flux
     test_xformers_attention = False
     test_layerwise_casting = True
     test_group_offloading = True
@@ -360,6 +359,30 @@ class HunyuanVideoFramepackPipelineFastTests(
             "VAE tiling should not affect the inference results",
         )
 
+    def test_float16_inference(self, expected_max_diff=0.2):
+        # NOTE: this test needs a higher tolerance because of multiple forwards through
+        # the model, which compounds the overall fp32 vs fp16 numerical differences. It
+        # shouldn't be expected that the results are the same, so we bump the tolerance.
+        return super().test_float16_inference(expected_max_diff)
+
+    @unittest.skip("The image_encoder uses SiglipVisionModel, which does not support sequential CPU offloading.")
+    def test_sequential_cpu_offload_forward_pass(self):
+        # https://github.com/huggingface/transformers/blob/21cb353b7b4f77c6f5f5c3341d660f86ff416d04/src/transformers/models/siglip/modeling_siglip.py#L803
+        # This is because it instantiates its attention layer from torch.nn.MultiheadAttention, which calls to
+        # `torch.nn.functional.multi_head_attention_forward` with the weights and bias. Since the hook is never
+        # triggered with a forward pass call, the weights stay on the CPU. There are more examples where we skip
+        # this test because of MHA (example: HunyuanDiT because of AttentionPooling layer).
+        pass
+
+    @unittest.skip("The image_encoder uses SiglipVisionModel, which does not support sequential CPU offloading.")
+    def test_sequential_offload_forward_pass_twice(self):
+        # https://github.com/huggingface/transformers/blob/21cb353b7b4f77c6f5f5c3341d660f86ff416d04/src/transformers/models/siglip/modeling_siglip.py#L803
+        # This is because it instantiates its attention layer from torch.nn.MultiheadAttention, which calls to
+        # `torch.nn.functional.multi_head_attention_forward` with the weights and bias. Since the hook is never
+        # triggered with a forward pass call, the weights stay on the CPU. There are more examples where we skip
+        # this test because of MHA (example: HunyuanDiT because of AttentionPooling layer).
+        pass
+
     # TODO(aryan): Create a dummy gemma model with smol vocab size
     @unittest.skip(
         "A very small vocab size is used for fast tests. So, any kind of prompt other than the empty default used in other tests will lead to a embedding lookup error. This test uses a long prompt that causes the error."
diff --git a/tests/pipelines/hunyuandit/test_hunyuan_dit.py b/tests/pipelines/hunyuandit/test_hunyuan_dit.py
index 5aa6372a89..7a5f807213 100644
--- a/tests/pipelines/hunyuandit/test_hunyuan_dit.py
+++ b/tests/pipelines/hunyuandit/test_hunyuan_dit.py
@@ -124,14 +124,22 @@ class HunyuanDiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         max_diff = np.abs(image_slice.flatten() - expected_slice).max()
         self.assertLessEqual(max_diff, 1e-3)
 
-    @unittest.skip("Not supported.")
+    @unittest.skip("The HunyuanDiT Attention pooling layer does not support sequential CPU offloading.")
     def test_sequential_cpu_offload_forward_pass(self):
         # TODO(YiYi) need to fix later
+        # This is because it instantiates its attention layer from torch.nn.MultiheadAttention, which calls to
+        # `torch.nn.functional.multi_head_attention_forward` with the weights and bias. Since the hook is never
+        # triggered with a forward pass call, the weights stay on the CPU. There are more examples where we skip
+        # this test because of MHA (example: HunyuanVideo Framepack)
         pass
 
-    @unittest.skip("Not supported.")
+    @unittest.skip("The HunyuanDiT Attention pooling layer does not support sequential CPU offloading.")
     def test_sequential_offload_forward_pass_twice(self):
         # TODO(YiYi) need to fix later
+        # This is because it instantiates its attention layer from torch.nn.MultiheadAttention, which calls to
+        # `torch.nn.functional.multi_head_attention_forward` with the weights and bias. Since the hook is never
+        # triggered with a forward pass call, the weights stay on the CPU. There are more examples where we skip
+        # this test because of MHA (example: HunyuanVideo Framepack)
         pass
 
     def test_inference_batch_single_identical(self):
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 687a28294c..207cff2a3c 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -2270,9 +2270,10 @@ class PipelineTesterMixin:
                 if hasattr(module, "_diffusers_hook")
             )
         )
-        for component_name in ["vae", "vqvae"]:
-            if hasattr(pipe, component_name):
-                getattr(pipe, component_name).to(torch_device)
+        for component_name in ["vae", "vqvae", "image_encoder"]:
+            component = getattr(pipe, component_name, None)
+            if isinstance(component, torch.nn.Module):
+                component.to(torch_device)
 
         def run_forward(pipe):
             torch.manual_seed(0)
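
The sequential-offload skips in this patch all trace back to the same behavior in torch.nn.MultiheadAttention. The sketch below is a minimal, stand-alone illustration (not the diffusers/accelerate hook code, only stock PyTorch): MultiheadAttention.forward hands its projection weights as plain tensors to torch.nn.functional.multi_head_attention_forward, so the out_proj submodule's own forward is never invoked and a device-placement pre-hook registered on it never runs, leaving its weights on the CPU.

    import torch
    import torch.nn as nn

    # Hypothetical stand-in for a per-module offloading hook: a real sequential CPU
    # offload hook would move the module's weights to the accelerator in a forward
    # pre-hook. Here we only record whether the hook ever runs.
    fired = []

    def make_hook(name):
        def hook(module, args):
            fired.append(name)
        return hook

    mha = nn.MultiheadAttention(embed_dim=8, num_heads=2, batch_first=True)
    for name, submodule in mha.named_modules():
        if name:  # register on every submodule (here only "out_proj"), not the root
            submodule.register_forward_pre_hook(make_hook(name))

    x = torch.randn(1, 4, 8)
    mha(x, x, x)
    print(fired)  # [] -- out_proj.forward is never called, so its hook never fires

    # A plain Linear, by contrast, triggers its pre-hook on every call, which is
    # exactly what hook-based sequential offloading relies on.
    linear = nn.Linear(8, 8)
    linear.register_forward_pre_hook(make_hook("linear"))
    linear(x)
    print(fired)  # ["linear"]

HunyuanDiT's AttentionPooling layer and SiglipVisionModel's attention both go through this code path, which is why the patch skips the sequential offload tests for these pipelines rather than loosening tolerances.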