From b8a4cbac14d32afa6c6e6c5b9cd17f9715214220 Mon Sep 17 00:00:00 2001 From: naykun Date: Mon, 15 Dec 2025 15:05:01 +0800 Subject: [PATCH 01/11] [qwen-image] edit 2511 support (#12839) * [qwen-image] edit 2511 support * Apply style fixes --------- Co-authored-by: github-actions[bot] --- .../transformers/transformer_qwenimage.py | 73 +++++++++++++++++-- 1 file changed, 68 insertions(+), 5 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_qwenimage.py b/src/diffusers/models/transformers/transformer_qwenimage.py index c0fa031b9f..3adfcdb147 100644 --- a/src/diffusers/models/transformers/transformer_qwenimage.py +++ b/src/diffusers/models/transformers/transformer_qwenimage.py @@ -14,6 +14,7 @@ import functools import math +from math import prod from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np @@ -363,7 +364,13 @@ class QwenDoubleStreamAttnProcessor2_0: @maybe_allow_in_graph class QwenImageTransformerBlock(nn.Module): def __init__( - self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6 + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + qk_norm: str = "rms_norm", + eps: float = 1e-6, + zero_cond_t: bool = False, ): super().__init__() @@ -403,10 +410,43 @@ class QwenImageTransformerBlock(nn.Module): self.txt_norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps) self.txt_mlp = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") - def _modulate(self, x, mod_params): + self.zero_cond_t = zero_cond_t + + def _modulate(self, x, mod_params, index=None): """Apply modulation to input tensor""" + # x: b l d, shift: b d, scale: b d, gate: b d shift, scale, gate = mod_params.chunk(3, dim=-1) - return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1) + + if index is not None: + # Assuming mod_params batch dim is 2*actual_batch (chunked into 2 parts) + # So shift, scale, gate have shape [2*actual_batch, d] + actual_batch = shift.size(0) // 2 + shift_0, shift_1 = shift[:actual_batch], shift[actual_batch:] # each: [actual_batch, d] + scale_0, scale_1 = scale[:actual_batch], scale[actual_batch:] + gate_0, gate_1 = gate[:actual_batch], gate[actual_batch:] + + # index: [b, l] where b is actual batch size + # Expand to [b, l, 1] to match feature dimension + index_expanded = index.unsqueeze(-1) # [b, l, 1] + + # Expand chunks to [b, 1, d] then broadcast to [b, l, d] + shift_0_exp = shift_0.unsqueeze(1) # [b, 1, d] + shift_1_exp = shift_1.unsqueeze(1) # [b, 1, d] + scale_0_exp = scale_0.unsqueeze(1) + scale_1_exp = scale_1.unsqueeze(1) + gate_0_exp = gate_0.unsqueeze(1) + gate_1_exp = gate_1.unsqueeze(1) + + # Use torch.where to select based on index + shift_result = torch.where(index_expanded == 0, shift_0_exp, shift_1_exp) + scale_result = torch.where(index_expanded == 0, scale_0_exp, scale_1_exp) + gate_result = torch.where(index_expanded == 0, gate_0_exp, gate_1_exp) + else: + shift_result = shift.unsqueeze(1) + scale_result = scale.unsqueeze(1) + gate_result = gate.unsqueeze(1) + + return x * (1 + scale_result) + shift_result, gate_result def forward( self, @@ -416,9 +456,13 @@ class QwenImageTransformerBlock(nn.Module): temb: torch.Tensor, image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, joint_attention_kwargs: Optional[Dict[str, Any]] = None, + modulate_index: Optional[List[int]] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: # Get modulation parameters for both streams img_mod_params = self.img_mod(temb) # [B, 
6*dim] + + if self.zero_cond_t: + temb = torch.chunk(temb, 2, dim=0)[0] txt_mod_params = self.txt_mod(temb) # [B, 6*dim] # Split modulation parameters for norm1 and norm2 @@ -427,7 +471,7 @@ class QwenImageTransformerBlock(nn.Module): # Process image stream - norm1 + modulation img_normed = self.img_norm1(hidden_states) - img_modulated, img_gate1 = self._modulate(img_normed, img_mod1) + img_modulated, img_gate1 = self._modulate(img_normed, img_mod1, modulate_index) # Process text stream - norm1 + modulation txt_normed = self.txt_norm1(encoder_hidden_states) @@ -457,7 +501,7 @@ class QwenImageTransformerBlock(nn.Module): # Process image stream - norm2 + MLP img_normed2 = self.img_norm2(hidden_states) - img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2) + img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2, modulate_index) img_mlp_output = self.img_mlp(img_modulated2) hidden_states = hidden_states + img_gate2 * img_mlp_output @@ -533,6 +577,7 @@ class QwenImageTransformer2DModel( joint_attention_dim: int = 3584, guidance_embeds: bool = False, # TODO: this should probably be removed axes_dims_rope: Tuple[int, int, int] = (16, 56, 56), + zero_cond_t: bool = False, ): super().__init__() self.out_channels = out_channels or in_channels @@ -553,6 +598,7 @@ class QwenImageTransformer2DModel( dim=self.inner_dim, num_attention_heads=num_attention_heads, attention_head_dim=attention_head_dim, + zero_cond_t=zero_cond_t, ) for _ in range(num_layers) ] @@ -562,6 +608,7 @@ class QwenImageTransformer2DModel( self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True) self.gradient_checkpointing = False + self.zero_cond_t = zero_cond_t def forward( self, @@ -618,6 +665,17 @@ class QwenImageTransformer2DModel( hidden_states = self.img_in(hidden_states) timestep = timestep.to(hidden_states.dtype) + + if self.zero_cond_t: + timestep = torch.cat([timestep, timestep * 0], dim=0) + modulate_index = torch.tensor( + [[0] * prod(sample[0]) + [1] * sum([prod(s) for s in sample[1:]]) for sample in img_shapes], + device=timestep.device, + dtype=torch.int, + ) + else: + modulate_index = None + encoder_hidden_states = self.txt_norm(encoder_hidden_states) encoder_hidden_states = self.txt_in(encoder_hidden_states) @@ -641,6 +699,8 @@ class QwenImageTransformer2DModel( encoder_hidden_states_mask, temb, image_rotary_emb, + attention_kwargs, + modulate_index, ) else: @@ -651,6 +711,7 @@ class QwenImageTransformer2DModel( temb=temb, image_rotary_emb=image_rotary_emb, joint_attention_kwargs=attention_kwargs, + modulate_index=modulate_index, ) # controlnet residual @@ -659,6 +720,8 @@ class QwenImageTransformer2DModel( interval_control = int(np.ceil(interval_control)) hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control] + if self.zero_cond_t: + temb = temb.chunk(2, dim=0)[0] # Use only the image part (hidden_states) from the dual-stream blocks hidden_states = self.norm_out(hidden_states, temb) output = self.proj_out(hidden_states) From 0c1ccc0775519d23a01732d62cca202215e8956b Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Mon, 15 Dec 2025 17:06:01 +0800 Subject: [PATCH 02/11] =?UTF-8?q?fix=20pytest=20tests/pipelines/pixart=5Fs?= =?UTF-8?q?igma/test=5Fpixart.py::PixArtSigmaPi=E2=80=A6=20(#12842)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix pytest tests/pipelines/pixart_sigma/test_pixart.py::PixArtSigmaPipelineIntegrationTests::test_pixart_512 in xpu Signed-off-by: Wang, Yi 
Co-authored-by: Sayak Paul --- tests/pipelines/pixart_sigma/test_pixart.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/pixart_sigma/test_pixart.py b/tests/pipelines/pixart_sigma/test_pixart.py index 2cb80df81a..6e8535062a 100644 --- a/tests/pipelines/pixart_sigma/test_pixart.py +++ b/tests/pipelines/pixart_sigma/test_pixart.py @@ -29,6 +29,7 @@ from diffusers import ( ) from ...testing_utils import ( + Expectations, backend_empty_cache, enable_full_determinism, numpy_cosine_similarity_distance, @@ -335,7 +336,14 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase): image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.0479, 0.0378, 0.0217, 0.0942, 0.064, 0.0791, 0.2073, 0.1975, 0.2017]) + + expected_slices = Expectations( + { + ("xpu", 3): np.array([0.0417, 0.0388, 0.0061, 0.0618, 0.0517, 0.0420, 0.1038, 0.1055, 0.1257]), + ("cuda", None): np.array([0.0479, 0.0378, 0.0217, 0.0942, 0.064, 0.0791, 0.2073, 0.1975, 0.2017]), + } + ) + expected_slice = expected_slices.get_expectation() max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice) self.assertLessEqual(max_diff, 1e-4) From 58519283e7fa143d3ae2bc086fcf53264cf2ece3 Mon Sep 17 00:00:00 2001 From: Yuqian Hong Date: Mon, 15 Dec 2025 18:22:42 +0800 Subject: [PATCH 03/11] Support for control-lora (#10686) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * run control-lora on diffusers * cannot load lora adapter * test * 1 * add control-lora * 1 * 1 * 1 * fix PeftAdapterMixin * fix module_to_save bug * delete json print * resolve conflits * merged but bug * change peft.py * 1 * delete state_dict print * fix alpha * Create control_lora.py * Add files via upload * rename * no need modify as peft updated * add doc * fix code style * styling isn't that hard 😉 * empty --------- Co-authored-by: Sayak Paul --- docs/source/en/api/models/controlnet.md | 15 ++ .../research_projects/control_lora/README.md | 41 ++++ .../control_lora/control_lora.py | 58 ++++++ src/diffusers/loaders/peft.py | 16 ++ .../models/controlnets/controlnet.py | 3 +- src/diffusers/utils/__init__.py | 1 + src/diffusers/utils/state_dict_utils.py | 179 ++++++++++++++++++ 7 files changed, 312 insertions(+), 1 deletion(-) create mode 100644 examples/research_projects/control_lora/README.md create mode 100644 examples/research_projects/control_lora/control_lora.py diff --git a/docs/source/en/api/models/controlnet.md b/docs/source/en/api/models/controlnet.md index f56b7383a0..0821d63fd1 100644 --- a/docs/source/en/api/models/controlnet.md +++ b/docs/source/en/api/models/controlnet.md @@ -33,6 +33,21 @@ url = "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/m pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=controlnet) ``` +## Loading from Control LoRA + +Control-LoRA is introduced by Stability AI in [stabilityai/control-lora](https://huggingface.co/stabilityai/control-lora) by adding low-rank parameter efficient fine tuning to ControlNet. This approach offers a more efficient and compact method to bring model control to a wider variety of consumer GPUs. 
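+
+The snippet below assumes `import torch`. Internally, `load_lora_adapter` detects the Stability AI Control-LoRA format through the `lora_controlnet` key in the state dict and converts the weights to the PEFT layout before injecting them into the `ControlNetModel`.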
+ +```py +from diffusers import ControlNetModel, UNet2DConditionModel + +lora_id = "stabilityai/control-lora" +lora_filename = "control-LoRAs-rank128/control-lora-canny-rank128.safetensors" + +unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", torch_dtype=torch.bfloat16).to("cuda") +controlnet = ControlNetModel.from_unet(unet).to(device="cuda", dtype=torch.bfloat16) +controlnet.load_lora_adapter(lora_id, weight_name=lora_filename, prefix=None, controlnet_config=controlnet.config) +``` + ## ControlNetModel [[autodoc]] ControlNetModel diff --git a/examples/research_projects/control_lora/README.md b/examples/research_projects/control_lora/README.md new file mode 100644 index 0000000000..49aa848e3e --- /dev/null +++ b/examples/research_projects/control_lora/README.md @@ -0,0 +1,41 @@ +# Control-LoRA inference example + +Control-LoRA is introduced by Stability AI in [stabilityai/control-lora](https://huggingface.co/stabilityai/control-lora) by adding low-rank parameter efficient fine tuning to ControlNet. This approach offers a more efficient and compact method to bring model control to a wider variety of consumer GPUs. + +## Installing the dependencies + +Before running the scripts, make sure to install the library's training dependencies: + +**Important** + +To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment: +```bash +git clone https://github.com/huggingface/diffusers +cd diffusers +pip install . +``` + +Then cd in the example folder and run +```bash +pip install -r requirements.txt +``` + +And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with: + +```bash +accelerate config +``` + +## Inference on SDXL + +[stabilityai/control-lora](https://huggingface.co/stabilityai/control-lora) provides a set of Control-LoRA weights for SDXL. Here we use the `canny` condition to generate an image from a text prompt and a reference image. 
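+
+The script boils down to three steps: build a `ControlNetModel` from the SDXL UNet, load the Control-LoRA weights into it as a LoRA adapter, and run `StableDiffusionXLControlNetPipeline` with a canny edge map as the conditioning image. A condensed sketch of those steps is shown below; it omits the custom VAE and the cv2 canny preprocessing used in the full script, and the canny image path is a placeholder.
+
+```py
+import torch
+from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, UNet2DConditionModel
+from diffusers.utils import load_image
+
+pipe_id = "stabilityai/stable-diffusion-xl-base-1.0"
+
+# Create a ControlNet that mirrors the SDXL UNet, then load the Control-LoRA weights into it
+unet = UNet2DConditionModel.from_pretrained(pipe_id, subfolder="unet", torch_dtype=torch.bfloat16).to("cuda")
+controlnet = ControlNetModel.from_unet(unet).to(device="cuda", dtype=torch.bfloat16)
+controlnet.load_lora_adapter(
+    "stabilityai/control-lora",
+    weight_name="control-LoRAs-rank128/control-lora-canny-rank128.safetensors",
+    prefix=None,
+    controlnet_config=controlnet.config,
+)
+
+# Plug the ControlNet into the SDXL ControlNet pipeline and condition on a pre-computed canny image
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    pipe_id, unet=unet, controlnet=controlnet, torch_dtype=torch.bfloat16
+).to("cuda")
+canny_image = load_image("path/to/canny_edge_map.png")  # placeholder; the full script derives this with cv2.Canny
+image = pipe("aerial view, a futuristic research complex in a bright foggy jungle", image=canny_image).images[0]
+```
+
+To run the complete example: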
+ +```bash +python control_lora.py +``` + +## Acknowledgements + +- [stabilityai/control-lora](https://huggingface.co/stabilityai/control-lora) +- [comfyanonymous/ControlNet-v1-1_fp16_safetensors](https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors) +- [HighCWu/control-lora-v2](https://github.com/HighCWu/control-lora-v2) \ No newline at end of file diff --git a/examples/research_projects/control_lora/control_lora.py b/examples/research_projects/control_lora/control_lora.py new file mode 100644 index 0000000000..a0ad1981c7 --- /dev/null +++ b/examples/research_projects/control_lora/control_lora.py @@ -0,0 +1,58 @@ +import cv2 +import numpy as np +import torch +from PIL import Image + +from diffusers import ( + AutoencoderKL, + ControlNetModel, + StableDiffusionXLControlNetPipeline, + UNet2DConditionModel, +) +from diffusers.utils import load_image, make_image_grid + + +pipe_id = "stabilityai/stable-diffusion-xl-base-1.0" +lora_id = "stabilityai/control-lora" +lora_filename = "control-LoRAs-rank128/control-lora-canny-rank128.safetensors" + +unet = UNet2DConditionModel.from_pretrained(pipe_id, subfolder="unet", torch_dtype=torch.bfloat16).to("cuda") +controlnet = ControlNetModel.from_unet(unet).to(device="cuda", dtype=torch.bfloat16) +controlnet.load_lora_adapter(lora_id, weight_name=lora_filename, prefix=None, controlnet_config=controlnet.config) + +prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting" +negative_prompt = "low quality, bad quality, sketches" + +image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" +) + +controlnet_conditioning_scale = 1.0 # recommended for good generalization + +vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae", torch_dtype=torch.bfloat16) +pipe = StableDiffusionXLControlNetPipeline.from_pretrained( + pipe_id, + unet=unet, + controlnet=controlnet, + vae=vae, + torch_dtype=torch.bfloat16, + safety_checker=None, +).to("cuda") + +image = np.array(image) +image = cv2.Canny(image, 100, 200) +image = image[:, :, None] +image = np.concatenate([image, image, image], axis=2) +image = Image.fromarray(image) + +images = pipe( + prompt, + negative_prompt=negative_prompt, + image=image, + controlnet_conditioning_scale=controlnet_conditioning_scale, + num_images_per_prompt=4, +).images + +final_image = [image] + images +grid = make_image_grid(final_image, 1, 5) +grid.save("hf-logo_canny.png") diff --git a/src/diffusers/loaders/peft.py b/src/diffusers/loaders/peft.py index 3f8519bbfa..30a78f00b3 100644 --- a/src/diffusers/loaders/peft.py +++ b/src/diffusers/loaders/peft.py @@ -27,6 +27,7 @@ from ..utils import ( MIN_PEFT_VERSION, USE_PEFT_BACKEND, check_peft_version, + convert_sai_sd_control_lora_state_dict_to_peft, convert_unet_state_dict_to_peft, delete_adapter_layers, get_adapter_name, @@ -232,6 +233,13 @@ class PeftAdapterMixin: if "lora_A" not in first_key: state_dict = convert_unet_state_dict_to_peft(state_dict) + # Control LoRA from SAI is different from BFL Control LoRA + # https://huggingface.co/stabilityai/control-lora + # https://huggingface.co/comfyanonymous/ControlNet-v1-1_fp16_safetensors + is_sai_sd_control_lora = "lora_controlnet" in state_dict + if is_sai_sd_control_lora: + state_dict = convert_sai_sd_control_lora_state_dict_to_peft(state_dict) + rank = {} for key, val in state_dict.items(): # Cannot figure out rank from lora layers that don't have at least 2 dimensions. 
@@ -263,6 +271,14 @@ class PeftAdapterMixin: adapter_name=adapter_name, ) + # Adjust LoRA config for Control LoRA + if is_sai_sd_control_lora: + lora_config.lora_alpha = lora_config.r + lora_config.alpha_pattern = lora_config.rank_pattern + lora_config.bias = "all" + lora_config.modules_to_save = lora_config.exclude_modules + lora_config.exclude_modules = None + # Date: Tue, 16 Dec 2025 01:45:17 +0800 Subject: [PATCH 04/11] Add support for LongCat-Image (#12828) * Add LongCat-Image * Update src/diffusers/models/transformers/transformer_longcat_image.py Co-authored-by: YiYi Xu * Update src/diffusers/models/transformers/transformer_longcat_image.py Co-authored-by: YiYi Xu * Update src/diffusers/models/transformers/transformer_longcat_image.py Co-authored-by: YiYi Xu * Update src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py Co-authored-by: YiYi Xu * Update src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py Co-authored-by: YiYi Xu * Update src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py Co-authored-by: YiYi Xu * Update src/diffusers/models/transformers/transformer_longcat_image.py Co-authored-by: YiYi Xu * Update src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py Co-authored-by: YiYi Xu * fix code * add doc * Update src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py Co-authored-by: YiYi Xu * Update src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py Co-authored-by: YiYi Xu * Update src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py Co-authored-by: YiYi Xu * Update src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py Co-authored-by: YiYi Xu * Update src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py Co-authored-by: YiYi Xu * Update src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py Co-authored-by: YiYi Xu * fix code & mask style & fix-copies * Apply style fixes * fix single input rewrite error --------- Co-authored-by: YiYi Xu Co-authored-by: github-actions[bot] Co-authored-by: hadoop-imagen --- docs/source/en/_toctree.yml | 6 +- .../api/models/longcat_image_transformer2d.md | 25 + docs/source/en/api/pipelines/longcat_image.md | 114 +++ src/diffusers/__init__.py | 6 + src/diffusers/models/__init__.py | 2 + src/diffusers/models/transformers/__init__.py | 1 + .../transformers/transformer_longcat_image.py | 548 +++++++++++++ src/diffusers/pipelines/__init__.py | 2 + .../pipelines/longcat_image/__init__.py | 51 ++ .../longcat_image/pipeline_longcat_image.py | 666 ++++++++++++++++ .../pipeline_longcat_image_edit.py | 726 ++++++++++++++++++ .../longcat_image/pipeline_output.py | 21 + .../longcat_image/system_messages.py | 142 ++++ src/diffusers/utils/dummy_pt_objects.py | 15 + .../dummy_torch_and_transformers_objects.py | 30 + tests/pipelines/longcat_image/__init__.py | 0 16 files changed, 2354 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/api/models/longcat_image_transformer2d.md create mode 100644 docs/source/en/api/pipelines/longcat_image.md create mode 100644 src/diffusers/models/transformers/transformer_longcat_image.py create mode 100644 src/diffusers/pipelines/longcat_image/__init__.py create mode 100644 src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py create mode 100644 src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py create mode 100644 src/diffusers/pipelines/longcat_image/pipeline_output.py create mode 100644 src/diffusers/pipelines/longcat_image/system_messages.py create mode 100644 
tests/pipelines/longcat_image/__init__.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index c176f8786f..f0cb016443 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -365,6 +365,8 @@ title: HunyuanVideoTransformer3DModel - local: api/models/latte_transformer3d title: LatteTransformer3DModel + - local: api/models/longcat_image_transformer2d + title: LongCatImageTransformer2DModel - local: api/models/ltx_video_transformer3d title: LTXVideoTransformer3DModel - local: api/models/lumina2_transformer2d @@ -402,7 +404,7 @@ - local: api/models/wan_transformer_3d title: WanTransformer3DModel - local: api/models/z_image_transformer2d - title: ZImageTransformer2DModel + title: ZImageTransformer2DModel title: Transformers - sections: - local: api/models/stable_cascade_unet @@ -563,6 +565,8 @@ title: Latent Diffusion - local: api/pipelines/ledits_pp title: LEDITS++ + - local: api/pipelines/longcat_image + title: LongCat-Image - local: api/pipelines/lumina2 title: Lumina 2.0 - local: api/pipelines/lumina diff --git a/docs/source/en/api/models/longcat_image_transformer2d.md b/docs/source/en/api/models/longcat_image_transformer2d.md new file mode 100644 index 0000000000..f40b2583e6 --- /dev/null +++ b/docs/source/en/api/models/longcat_image_transformer2d.md @@ -0,0 +1,25 @@ + + +# LongCatImageTransformer2DModel + +The model can be loaded with the following code snippet. + +```python +from diffusers import LongCatImageTransformer2DModel + +transformer = LongCatImageTransformer2DModel.from_pretrained("meituan-longcat/LongCat-Image ", subfolder="transformer", torch_dtype=torch.bfloat16) +``` + +## LongCatImageTransformer2DModel + +[[autodoc]] LongCatImageTransformer2DModel \ No newline at end of file diff --git a/docs/source/en/api/pipelines/longcat_image.md b/docs/source/en/api/pipelines/longcat_image.md new file mode 100644 index 0000000000..a7e8a7a371 --- /dev/null +++ b/docs/source/en/api/pipelines/longcat_image.md @@ -0,0 +1,114 @@ + + +# LongCat-Image + +
+ LoRA +
+ + +We introduce LongCat-Image, a pioneering open-source and bilingual (Chinese-English) foundation model for image generation, designed to address core challenges in multilingual text rendering, photorealism, deployment efficiency, and developer accessibility prevalent in current leading models. + + +### Key Features +- 🌟 **Exceptional Efficiency and Performance**: With only **6B parameters**, LongCat-Image surpasses numerous open-source models that are several times larger across multiple benchmarks, demonstrating the immense potential of efficient model design. +- 🌟 **Superior Editing Performance**: LongCat-Image-Edit model achieves state-of-the-art performance among open-source models, delivering leading instruction-following and image quality with superior visual consistency. +- 🌟 **Powerful Chinese Text Rendering**: LongCat-Image demonstrates superior accuracy and stability in rendering common Chinese characters compared to existing SOTA open-source models and achieves industry-leading coverage of the Chinese dictionary. +- 🌟 **Remarkable Photorealism**: Through an innovative data strategy and training framework, LongCat-Image achieves remarkable photorealism in generated images. +- 🌟 **Comprehensive Open-Source Ecosystem**: We provide a complete toolchain, from intermediate checkpoints to full training code, significantly lowering the barrier for further research and development. + +For more details, please refer to the comprehensive [***LongCat-Image Technical Report***](https://arxiv.org/abs/2412.11963) + + +## Usage Example + +```py +import torch +import diffusers +from diffusers import LongCatImagePipeline + +weight_dtype = torch.bfloat16 +pipe = LongCatImagePipeline.from_pretrained("meituan-longcat/LongCat-Image", torch_dtype=torch.bfloat16 ) +pipe.to('cuda') +# pipe.enable_model_cpu_offload() + +prompt = '一个年轻的亚裔女性,身穿黄色针织衫,搭配白色项链。她的双手放在膝盖上,表情恬静。背景是一堵粗糙的砖墙,午后的阳光温暖地洒在她身上,营造出一种宁静而温馨的氛围。镜头采用中距离视角,突出她的神态和服饰的细节。光线柔和地打在她的脸上,强调她的五官和饰品的质感,增加画面的层次感与亲和力。整个画面构图简洁,砖墙的纹理与阳光的光影效果相得益彰,突显出人物的优雅与从容。' +image = pipe( + prompt, + height=768, + width=1344, + guidance_scale=4.0, + num_inference_steps=50, + num_images_per_prompt=1, + generator=torch.Generator("cpu").manual_seed(43), + enable_cfg_renorm=True, + enable_prompt_rewrite=True, +).images[0] +image.save(f'./longcat_image_t2i_example.png') +``` + + +This pipeline was contributed by LongCat-Image Team. The original codebase can be found [here](https://github.com/meituan-longcat/LongCat-Image). + +Available models: +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ModelsTypeDescriptionDownload Link
LongCat‑ImageText‑to‑ImageFinal Release. The standard model for out‑of‑the‑box inference. + 🤗 Huggingface +
LongCat‑Image‑DevText‑to‑ImageDevelopment. Mid-training checkpoint, suitable for fine-tuning. + 🤗 Huggingface +
LongCat‑Image‑EditImage EditingSpecialized model for image editing. + 🤗 Huggingface +
+
+ +## LongCatImagePipeline + +[[autodoc]] LongCatImagePipeline +- all +- __call__ + +## LongCatImagePipelineOutput + +[[autodoc]] pipelines.longcat_image.pipeline_output.LongCatImagePipelineOutput + + + diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 29a38b4312..86f196f9be 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -235,6 +235,7 @@ else: "Kandinsky3UNet", "Kandinsky5Transformer3DModel", "LatteTransformer3DModel", + "LongCatImageTransformer2DModel", "LTXVideoTransformer3DModel", "Lumina2Transformer2DModel", "LuminaNextDiT2DModel", @@ -532,6 +533,8 @@ else: "LDMTextToImagePipeline", "LEditsPPPipelineStableDiffusion", "LEditsPPPipelineStableDiffusionXL", + "LongCatImageEditPipeline", + "LongCatImagePipeline", "LTXConditionPipeline", "LTXImageToVideoPipeline", "LTXLatentUpsamplePipeline", @@ -970,6 +973,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: Kandinsky3UNet, Kandinsky5Transformer3DModel, LatteTransformer3DModel, + LongCatImageTransformer2DModel, LTXVideoTransformer3DModel, Lumina2Transformer2DModel, LuminaNextDiT2DModel, @@ -1237,6 +1241,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: LDMTextToImagePipeline, LEditsPPPipelineStableDiffusion, LEditsPPPipelineStableDiffusionXL, + LongCatImageEditPipeline, + LongCatImagePipeline, LTXConditionPipeline, LTXImageToVideoPipeline, LTXLatentUpsamplePipeline, diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 29d8b0b5a5..4c1b397bdf 100755 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -101,6 +101,7 @@ if is_torch_available(): _import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"] _import_structure["transformers.transformer_hunyuanimage"] = ["HunyuanImageTransformer2DModel"] _import_structure["transformers.transformer_kandinsky"] = ["Kandinsky5Transformer3DModel"] + _import_structure["transformers.transformer_longcat_image"] = ["LongCatImageTransformer2DModel"] _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"] _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"] _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"] @@ -208,6 +209,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: HunyuanVideoTransformer3DModel, Kandinsky5Transformer3DModel, LatteTransformer3DModel, + LongCatImageTransformer2DModel, LTXVideoTransformer3DModel, Lumina2Transformer2DModel, LuminaNextDiT2DModel, diff --git a/src/diffusers/models/transformers/__init__.py b/src/diffusers/models/transformers/__init__.py index a42f6b2716..40b5d4a0df 100755 --- a/src/diffusers/models/transformers/__init__.py +++ b/src/diffusers/models/transformers/__init__.py @@ -33,6 +33,7 @@ if is_torch_available(): from .transformer_hunyuan_video_framepack import HunyuanVideoFramepackTransformer3DModel from .transformer_hunyuanimage import HunyuanImageTransformer2DModel from .transformer_kandinsky import Kandinsky5Transformer3DModel + from .transformer_longcat_image import LongCatImageTransformer2DModel from .transformer_ltx import LTXVideoTransformer3DModel from .transformer_lumina2 import Lumina2Transformer2DModel from .transformer_mochi import MochiTransformer3DModel diff --git a/src/diffusers/models/transformers/transformer_longcat_image.py b/src/diffusers/models/transformers/transformer_longcat_image.py new file mode 100644 index 0000000000..7fbaaa3fee --- /dev/null +++ 
b/src/diffusers/models/transformers/transformer_longcat_image.py @@ -0,0 +1,548 @@ +# Copyright 2025 MeiTuan LongCat-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import FromOriginalModelMixin, PeftAdapterMixin +from ...utils import is_torch_npu_available, logging +from ...utils.torch_utils import maybe_allow_in_graph +from ..attention import AttentionModuleMixin, FeedForward +from ..attention_dispatch import dispatch_attention_fn +from ..cache_utils import CacheMixin +from ..embeddings import TimestepEmbedding, Timesteps, apply_rotary_emb, get_1d_rotary_pos_embed +from ..modeling_outputs import Transformer2DModelOutput +from ..modeling_utils import ModelMixin +from ..normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def _get_projections(attn: "LongCatImageAttention", hidden_states, encoder_hidden_states=None): + query = attn.to_q(hidden_states) + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + + encoder_query = encoder_key = encoder_value = None + if encoder_hidden_states is not None and attn.added_kv_proj_dim is not None: + encoder_query = attn.add_q_proj(encoder_hidden_states) + encoder_key = attn.add_k_proj(encoder_hidden_states) + encoder_value = attn.add_v_proj(encoder_hidden_states) + + return query, key, value, encoder_query, encoder_key, encoder_value + + +def _get_fused_projections(attn: "LongCatImageAttention", hidden_states, encoder_hidden_states=None): + query, key, value = attn.to_qkv(hidden_states).chunk(3, dim=-1) + + encoder_query = encoder_key = encoder_value = (None,) + if encoder_hidden_states is not None and hasattr(attn, "to_added_qkv"): + encoder_query, encoder_key, encoder_value = attn.to_added_qkv(encoder_hidden_states).chunk(3, dim=-1) + + return query, key, value, encoder_query, encoder_key, encoder_value + + +def _get_qkv_projections(attn: "LongCatImageAttention", hidden_states, encoder_hidden_states=None): + if attn.fused_projections: + return _get_fused_projections(attn, hidden_states, encoder_hidden_states) + return _get_projections(attn, hidden_states, encoder_hidden_states) + + +class LongCatImageAttnProcessor: + _attention_backend = None + _parallel_config = None + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.0. 
Please upgrade your pytorch version.") + + def __call__( + self, + attn: "LongCatImageAttention", + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor = None, + attention_mask: Optional[torch.Tensor] = None, + image_rotary_emb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections( + attn, hidden_states, encoder_hidden_states + ) + + query = query.unflatten(-1, (attn.heads, -1)) + key = key.unflatten(-1, (attn.heads, -1)) + value = value.unflatten(-1, (attn.heads, -1)) + + query = attn.norm_q(query) + key = attn.norm_k(key) + + if attn.added_kv_proj_dim is not None: + encoder_query = encoder_query.unflatten(-1, (attn.heads, -1)) + encoder_key = encoder_key.unflatten(-1, (attn.heads, -1)) + encoder_value = encoder_value.unflatten(-1, (attn.heads, -1)) + + encoder_query = attn.norm_added_q(encoder_query) + encoder_key = attn.norm_added_k(encoder_key) + + query = torch.cat([encoder_query, query], dim=1) + key = torch.cat([encoder_key, key], dim=1) + value = torch.cat([encoder_value, value], dim=1) + + if image_rotary_emb is not None: + query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1) + key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1) + + hidden_states = dispatch_attention_fn( + query, + key, + value, + attn_mask=attention_mask, + backend=self._attention_backend, + parallel_config=self._parallel_config, + ) + hidden_states = hidden_states.flatten(2, 3) + hidden_states = hidden_states.to(query.dtype) + + if encoder_hidden_states is not None: + encoder_hidden_states, hidden_states = hidden_states.split_with_sizes( + [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1 + ) + hidden_states = attn.to_out[0](hidden_states) + hidden_states = attn.to_out[1](hidden_states) + encoder_hidden_states = attn.to_add_out(encoder_hidden_states) + + return hidden_states, encoder_hidden_states + else: + return hidden_states + + +class LongCatImageAttention(torch.nn.Module, AttentionModuleMixin): + _default_processor_cls = LongCatImageAttnProcessor + _available_processors = [ + LongCatImageAttnProcessor, + ] + + def __init__( + self, + query_dim: int, + heads: int = 8, + dim_head: int = 64, + dropout: float = 0.0, + bias: bool = False, + added_kv_proj_dim: Optional[int] = None, + added_proj_bias: Optional[bool] = True, + out_bias: bool = True, + eps: float = 1e-5, + out_dim: int = None, + context_pre_only: Optional[bool] = None, + pre_only: bool = False, + elementwise_affine: bool = True, + processor=None, + ): + super().__init__() + + self.head_dim = dim_head + self.inner_dim = out_dim if out_dim is not None else dim_head * heads + self.query_dim = query_dim + self.use_bias = bias + self.dropout = dropout + self.out_dim = out_dim if out_dim is not None else query_dim + self.context_pre_only = context_pre_only + self.pre_only = pre_only + self.heads = out_dim // dim_head if out_dim is not None else heads + self.added_kv_proj_dim = added_kv_proj_dim + self.added_proj_bias = added_proj_bias + + self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine) + self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine) + self.to_q = torch.nn.Linear(query_dim, self.inner_dim, bias=bias) + self.to_k = torch.nn.Linear(query_dim, self.inner_dim, bias=bias) + self.to_v = torch.nn.Linear(query_dim, self.inner_dim, bias=bias) + + if not self.pre_only: + self.to_out = torch.nn.ModuleList([]) + 
self.to_out.append(torch.nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)) + self.to_out.append(torch.nn.Dropout(dropout)) + + if added_kv_proj_dim is not None: + self.norm_added_q = torch.nn.RMSNorm(dim_head, eps=eps) + self.norm_added_k = torch.nn.RMSNorm(dim_head, eps=eps) + self.add_q_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias) + self.add_k_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias) + self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias) + self.to_add_out = torch.nn.Linear(self.inner_dim, query_dim, bias=out_bias) + + if processor is None: + processor = self._default_processor_cls() + self.set_processor(processor) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + image_rotary_emb: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.Tensor: + attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys()) + quiet_attn_parameters = {"ip_adapter_masks", "ip_hidden_states"} + unused_kwargs = [k for k, _ in kwargs.items() if k not in attn_parameters and k not in quiet_attn_parameters] + if len(unused_kwargs) > 0: + logger.warning( + f"joint_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored." + ) + kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters} + return self.processor(self, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb, **kwargs) + + +@maybe_allow_in_graph +class LongCatImageSingleTransformerBlock(nn.Module): + def __init__(self, dim: int, num_attention_heads: int, attention_head_dim: int, mlp_ratio: float = 4.0): + super().__init__() + self.mlp_hidden_dim = int(dim * mlp_ratio) + + self.norm = AdaLayerNormZeroSingle(dim) + self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim) + self.act_mlp = nn.GELU(approximate="tanh") + self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim) + + self.attn = LongCatImageAttention( + query_dim=dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + bias=True, + processor=LongCatImageAttnProcessor(), + eps=1e-6, + pre_only=True, + ) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + temb: torch.Tensor, + image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + text_seq_len = encoder_hidden_states.shape[1] + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) + + residual = hidden_states + norm_hidden_states, gate = self.norm(hidden_states, emb=temb) + mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states)) + joint_attention_kwargs = joint_attention_kwargs or {} + attn_output = self.attn( + hidden_states=norm_hidden_states, + image_rotary_emb=image_rotary_emb, + **joint_attention_kwargs, + ) + + hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2) + gate = gate.unsqueeze(1) + hidden_states = gate * self.proj_out(hidden_states) + hidden_states = residual + hidden_states + if hidden_states.dtype == torch.float16: + hidden_states = hidden_states.clip(-65504, 65504) + + encoder_hidden_states, hidden_states = hidden_states[:, :text_seq_len], hidden_states[:, text_seq_len:] + return encoder_hidden_states, hidden_states + + +@maybe_allow_in_graph +class 
LongCatImageTransformerBlock(nn.Module): + def __init__( + self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6 + ): + super().__init__() + + self.norm1 = AdaLayerNormZero(dim) + self.norm1_context = AdaLayerNormZero(dim) + + self.attn = LongCatImageAttention( + query_dim=dim, + added_kv_proj_dim=dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + context_pre_only=False, + bias=True, + processor=LongCatImageAttnProcessor(), + eps=eps, + ) + + self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6) + self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") + + self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6) + self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + temb: torch.Tensor, + image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb) + + norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context( + encoder_hidden_states, emb=temb + ) + joint_attention_kwargs = joint_attention_kwargs or {} + + # Attention. + attention_outputs = self.attn( + hidden_states=norm_hidden_states, + encoder_hidden_states=norm_encoder_hidden_states, + image_rotary_emb=image_rotary_emb, + **joint_attention_kwargs, + ) + + if len(attention_outputs) == 2: + attn_output, context_attn_output = attention_outputs + elif len(attention_outputs) == 3: + attn_output, context_attn_output, ip_attn_output = attention_outputs + + # Process attention outputs for the `hidden_states`. + attn_output = gate_msa.unsqueeze(1) * attn_output + hidden_states = hidden_states + attn_output + + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + ff_output = self.ff(norm_hidden_states) + ff_output = gate_mlp.unsqueeze(1) * ff_output + + hidden_states = hidden_states + ff_output + if len(attention_outputs) == 3: + hidden_states = hidden_states + ip_attn_output + + # Process attention outputs for the `encoder_hidden_states`. 
+ context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output + encoder_hidden_states = encoder_hidden_states + context_attn_output + + norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states) + norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None] + + context_ff_output = self.ff_context(norm_encoder_hidden_states) + encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output + if encoder_hidden_states.dtype == torch.float16: + encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504) + + return encoder_hidden_states, hidden_states + + +class LongCatImagePosEmbed(nn.Module): + def __init__(self, theta: int, axes_dim: List[int]): + super().__init__() + self.theta = theta + self.axes_dim = axes_dim + + def forward(self, ids: torch.Tensor) -> torch.Tensor: + n_axes = ids.shape[-1] + cos_out = [] + sin_out = [] + pos = ids.float() + is_mps = ids.device.type == "mps" + is_npu = ids.device.type == "npu" + freqs_dtype = torch.float32 if (is_mps or is_npu) else torch.float64 + for i in range(n_axes): + cos, sin = get_1d_rotary_pos_embed( + self.axes_dim[i], + pos[:, i], + theta=self.theta, + repeat_interleave_real=True, + use_real=True, + freqs_dtype=freqs_dtype, + ) + cos_out.append(cos) + sin_out.append(sin) + freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device) + freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device) + return freqs_cos, freqs_sin + + +class LongCatImageTimestepEmbeddings(nn.Module): + def __init__(self, embedding_dim): + super().__init__() + + self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0) + self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) + + def forward(self, timestep, hidden_dtype): + timesteps_proj = self.time_proj(timestep) + timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (N, D) + + return timesteps_emb + + +class LongCatImageTransformer2DModel( + ModelMixin, + ConfigMixin, + PeftAdapterMixin, + FromOriginalModelMixin, + CacheMixin, +): + """ + The Transformer model introduced in Longcat-Image. 
+ """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + patch_size: int = 1, + in_channels: int = 64, + num_layers: int = 19, + num_single_layers: int = 38, + attention_head_dim: int = 128, + num_attention_heads: int = 24, + joint_attention_dim: int = 3584, + pooled_projection_dim: int = 3584, + axes_dims_rope: List[int] = [16, 56, 56], + ): + super().__init__() + self.out_channels = in_channels + self.inner_dim = num_attention_heads * attention_head_dim + self.pooled_projection_dim = pooled_projection_dim + + self.pos_embed = LongCatImagePosEmbed(theta=10000, axes_dim=axes_dims_rope) + + self.time_embed = LongCatImageTimestepEmbeddings(embedding_dim=self.inner_dim) + + self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim) + self.x_embedder = torch.nn.Linear(in_channels, self.inner_dim) + + self.transformer_blocks = nn.ModuleList( + [ + LongCatImageTransformerBlock( + dim=self.inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + ) + for i in range(num_layers) + ] + ) + + self.single_transformer_blocks = nn.ModuleList( + [ + LongCatImageSingleTransformerBlock( + dim=self.inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + ) + for i in range(num_single_layers) + ] + ) + + self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6) + self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True) + + self.gradient_checkpointing = False + self.use_checkpoint = [True] * num_layers + self.use_single_checkpoint = [True] * num_single_layers + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor = None, + timestep: torch.LongTensor = None, + img_ids: torch.Tensor = None, + txt_ids: torch.Tensor = None, + guidance: torch.Tensor = None, + return_dict: bool = True, + ) -> Union[torch.FloatTensor, Transformer2DModelOutput]: + """ + The forward method. + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`): + Input `hidden_states`. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) to use. + timestep ( `torch.LongTensor`): + Used to indicate denoising step. + block_controlnet_hidden_states: (`list` of `torch.Tensor`): + A list of tensors that if specified are added to the residuals of transformer blocks. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. 
+ """ + hidden_states = self.x_embedder(hidden_states) + + timestep = timestep.to(hidden_states.dtype) * 1000 + + temb = self.time_embed(timestep, hidden_states.dtype) + encoder_hidden_states = self.context_embedder(encoder_hidden_states) + + ids = torch.cat((txt_ids, img_ids), dim=0) + if is_torch_npu_available(): + freqs_cos, freqs_sin = self.pos_embed(ids.cpu()) + image_rotary_emb = (freqs_cos.npu(), freqs_sin.npu()) + else: + image_rotary_emb = self.pos_embed(ids) + + for index_block, block in enumerate(self.transformer_blocks): + if torch.is_grad_enabled() and self.gradient_checkpointing and self.use_checkpoint[index_block]: + encoder_hidden_states, hidden_states = self._gradient_checkpointing_func( + block, + hidden_states, + encoder_hidden_states, + temb, + image_rotary_emb, + ) + else: + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + temb=temb, + image_rotary_emb=image_rotary_emb, + ) + + for index_block, block in enumerate(self.single_transformer_blocks): + if torch.is_grad_enabled() and self.gradient_checkpointing and self.use_single_checkpoint[index_block]: + encoder_hidden_states, hidden_states = self._gradient_checkpointing_func( + block, + hidden_states, + encoder_hidden_states, + temb, + image_rotary_emb, + ) + else: + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + temb=temb, + image_rotary_emb=image_rotary_emb, + ) + + hidden_states = self.norm_out(hidden_states, temb) + output = self.proj_out(hidden_states) + + if not return_dict: + return (output,) + + return Transformer2DModelOutput(sample=output) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 388551f812..ff5cd829ce 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -291,6 +291,7 @@ else: _import_structure["lumina"] = ["LuminaPipeline", "LuminaText2ImgPipeline"] _import_structure["lumina2"] = ["Lumina2Pipeline", "Lumina2Text2ImgPipeline"] _import_structure["lucy"] = ["LucyEditPipeline"] + _import_structure["longcat_image"] = ["LongCatImagePipeline", "LongCatImageEditPipeline"] _import_structure["marigold"].extend( [ "MarigoldDepthPipeline", @@ -718,6 +719,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: LEditsPPPipelineStableDiffusion, LEditsPPPipelineStableDiffusionXL, ) + from .longcat_image import LongCatImageEditPipeline, LongCatImagePipeline from .ltx import LTXConditionPipeline, LTXImageToVideoPipeline, LTXLatentUpsamplePipeline, LTXPipeline from .lucy import LucyEditPipeline from .lumina import LuminaPipeline, LuminaText2ImgPipeline diff --git a/src/diffusers/pipelines/longcat_image/__init__.py b/src/diffusers/pipelines/longcat_image/__init__.py new file mode 100644 index 0000000000..e4bb0e5819 --- /dev/null +++ b/src/diffusers/pipelines/longcat_image/__init__.py @@ -0,0 +1,51 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa: F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + 
_import_structure["pipeline_longcat_image"] = ["LongCatImagePipeline"] + _import_structure["pipeline_longcat_image_edit"] = ["LongCatImageEditPipeline"] + _import_structure["pipeline_output"] = ["LongCatImagePipelineOutput"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_longcat_image import LongCatImagePipeline + from .pipeline_longcat_image_edit import LongCatImageEditPipeline + from .pipeline_output import LongCatImagePipelineOutput + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py b/src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py new file mode 100644 index 0000000000..a758d545fa --- /dev/null +++ b/src/diffusers/pipelines/longcat_image/pipeline_longcat_image.py @@ -0,0 +1,666 @@ +# Copyright 2025 MeiTuan LongCat-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect +import re +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor + +from ...image_processor import VaeImageProcessor +from ...loaders import FromSingleFileMixin +from ...models.autoencoders import AutoencoderKL +from ...models.transformers import LongCatImageTransformer2DModel +from ...pipelines.pipeline_utils import DiffusionPipeline +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import is_torch_xla_available, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor +from .pipeline_output import LongCatImagePipelineOutput +from .system_messages import SYSTEM_PROMPT_EN, SYSTEM_PROMPT_ZH + + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import LongCatImagePipeline + + >>> pipe = LongCatImagePipeline.from_pretrained("meituan-longcat/LongCat-Image", torch_dtype=torch.bfloat16) + >>> pipe.to("cuda") + + >>> prompt = "一个年轻的亚裔女性,身穿黄色针织衫,搭配白色项链。她的双手放在膝盖上,表情恬静。背景是一堵粗糙的砖墙,午后的阳光温暖地洒在她身上,营造出一种宁静而温馨的氛围。镜头采用中距离视角,突出她的神态和服饰的细节。光线柔和地打在她的脸上,强调她的五官和饰品的质感,增加画面的层次感与亲和力。整个画面构图简洁,砖墙的纹理与阳光的光影效果相得益彰,突显出人物的优雅与从容。" + >>> image = pipe( + ... prompt, + ... height=768, + ... width=1344, + ... num_inference_steps=50, + ... guidance_scale=4.5, + ... generator=torch.Generator("cpu").manual_seed(43), + ... enable_cfg_renorm=True, + ... 
).images[0] + >>> image.save("longcat_image.png") + ``` +""" + + +def get_prompt_language(prompt): + pattern = re.compile(r"[\u4e00-\u9fff]") + if bool(pattern.search(prompt)): + return "zh" + return "en" + + +def split_quotation(prompt, quote_pairs=None): + """ + Implement a regex-based string splitting algorithm that identifies delimiters defined by single or double quote + pairs. Examples:: + >>> prompt_en = "Please write 'Hello' on the blackboard for me." >>> print(split_quotation(prompt_en)) >>> # + output: [('Please write ', False), ("'Hello'", True), (' on the blackboard for me.', False)] + """ + word_internal_quote_pattern = re.compile(r"[a-zA-Z]+'[a-zA-Z]+") + matches_word_internal_quote_pattern = word_internal_quote_pattern.findall(prompt) + mapping_word_internal_quote = [] + + for i, word_src in enumerate(set(matches_word_internal_quote_pattern)): + word_tgt = "longcat_$##$_longcat" * (i + 1) + prompt = prompt.replace(word_src, word_tgt) + mapping_word_internal_quote.append([word_src, word_tgt]) + + if quote_pairs is None: + quote_pairs = [("'", "'"), ('"', '"'), ("‘", "’"), ("“", "”")] + pattern = "|".join([re.escape(q1) + r"[^" + re.escape(q1 + q2) + r"]*?" + re.escape(q2) for q1, q2 in quote_pairs]) + parts = re.split(f"({pattern})", prompt) + + result = [] + for part in parts: + for word_src, word_tgt in mapping_word_internal_quote: + part = part.replace(word_tgt, word_src) + if re.match(pattern, part): + if len(part): + result.append((part, True)) + else: + if len(part): + result.append((part, False)) + return result + + +def prepare_pos_ids(modality_id=0, type="text", start=(0, 0), num_token=None, height=None, width=None): + if type == "text": + assert num_token + if height or width: + print('Warning: The parameters of height and width will be ignored in "text" type.') + pos_ids = torch.zeros(num_token, 3) + pos_ids[..., 0] = modality_id + pos_ids[..., 1] = torch.arange(num_token) + start[0] + pos_ids[..., 2] = torch.arange(num_token) + start[1] + elif type == "image": + assert height and width + if num_token: + print('Warning: The parameter of num_token will be ignored in "image" type.') + pos_ids = torch.zeros(height, width, 3) + pos_ids[..., 0] = modality_id + pos_ids[..., 1] = pos_ids[..., 1] + torch.arange(height)[:, None] + start[0] + pos_ids[..., 2] = pos_ids[..., 2] + torch.arange(width)[None, :] + start[1] + pos_ids = pos_ids.reshape(height * width, 3) + else: + raise KeyError(f'Unknow type {type}, only support "text" or "image".') + return pos_ids + + +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.15, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. 
If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class LongCatImagePipeline(DiffusionPipeline, FromSingleFileMixin): + r""" + The pipeline for text-to-image generation. 
+ """ + + model_cpu_offload_seq = "text_encoder->transformer->vae" + _optional_components = [] + _callback_tensor_inputs = ["latents", "prompt_embeds"] + + def __init__( + self, + scheduler: FlowMatchEulerDiscreteScheduler, + vae: AutoencoderKL, + text_encoder: Qwen2_5_VLForConditionalGeneration, + tokenizer: Qwen2Tokenizer, + text_processor: Qwen2VLProcessor, + transformer: LongCatImageTransformer2DModel, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + transformer=transformer, + scheduler=scheduler, + text_processor=text_processor, + ) + + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8 + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) + + self.prompt_template_encode_prefix = "<|im_start|>system\nAs an image captioning expert, generate a descriptive text prompt based on an image content, suitable for input to a text-to-image model.<|im_end|>\n<|im_start|>user\n" + self.prompt_template_encode_suffix = "<|im_end|>\n<|im_start|>assistant\n" + self.default_sample_size = 128 + self.tokenizer_max_length = 512 + + def rewire_prompt(self, prompt, device): + prompt = [prompt] if isinstance(prompt, str) else prompt + all_text = [] + for each_prompt in prompt: + language = get_prompt_language(each_prompt) + if language == "zh": + question = SYSTEM_PROMPT_ZH + f"\n用户输入为:{each_prompt}\n改写后的prompt为:" + else: + question = SYSTEM_PROMPT_EN + f"\nUser Input: {each_prompt}\nRewritten prompt:" + message = [ + { + "role": "user", + "content": [ + {"type": "text", "text": question}, + ], + } + ] + # Preparation for inference + text = self.text_processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True) + all_text.append(text) + + inputs = self.text_processor(text=all_text, padding=True, return_tensors="pt").to(device) + + self.text_encoder.to(device) + generated_ids = self.text_encoder.generate(**inputs, max_new_tokens=self.tokenizer_max_length) + generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] + output_text = self.text_processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + rewrite_prompt = output_text + return rewrite_prompt + + def _encode_prompt(self, prompt: List[str]): + batch_all_tokens = [] + + for each_prompt in prompt: + all_tokens = [] + for clean_prompt_sub, matched in split_quotation(each_prompt): + if matched: + for sub_word in clean_prompt_sub: + tokens = self.tokenizer(sub_word, add_special_tokens=False)["input_ids"] + all_tokens.extend(tokens) + else: + tokens = self.tokenizer(clean_prompt_sub, add_special_tokens=False)["input_ids"] + all_tokens.extend(tokens) + + if len(all_tokens) > self.tokenizer_max_length: + logger.warning( + "Your input was truncated because `max_sequence_length` is set to " + f" {self.tokenizer_max_length} input token nums : {len(all_tokens)}" + ) + all_tokens = all_tokens[: self.tokenizer_max_length] + batch_all_tokens.append(all_tokens) + + text_tokens_and_mask = self.tokenizer.pad( + {"input_ids": batch_all_tokens}, + max_length=self.tokenizer_max_length, + padding="max_length", + return_attention_mask=True, + return_tensors="pt", + ) + + prefix_tokens = self.tokenizer(self.prompt_template_encode_prefix, add_special_tokens=False)["input_ids"] + suffix_tokens = self.tokenizer(self.prompt_template_encode_suffix, add_special_tokens=False)["input_ids"] + 
prefix_len = len(prefix_tokens) + suffix_len = len(suffix_tokens) + + prefix_tokens_mask = torch.tensor([1] * len(prefix_tokens), dtype=text_tokens_and_mask.attention_mask[0].dtype) + suffix_tokens_mask = torch.tensor([1] * len(suffix_tokens), dtype=text_tokens_and_mask.attention_mask[0].dtype) + + prefix_tokens = torch.tensor(prefix_tokens, dtype=text_tokens_and_mask.input_ids.dtype) + suffix_tokens = torch.tensor(suffix_tokens, dtype=text_tokens_and_mask.input_ids.dtype) + + batch_size = text_tokens_and_mask.input_ids.size(0) + + prefix_tokens_batch = prefix_tokens.unsqueeze(0).expand(batch_size, -1) + suffix_tokens_batch = suffix_tokens.unsqueeze(0).expand(batch_size, -1) + prefix_mask_batch = prefix_tokens_mask.unsqueeze(0).expand(batch_size, -1) + suffix_mask_batch = suffix_tokens_mask.unsqueeze(0).expand(batch_size, -1) + + input_ids = torch.cat((prefix_tokens_batch, text_tokens_and_mask.input_ids, suffix_tokens_batch), dim=-1) + attention_mask = torch.cat((prefix_mask_batch, text_tokens_and_mask.attention_mask, suffix_mask_batch), dim=-1) + + input_ids = input_ids.to(self.device) + attention_mask = attention_mask.to(self.device) + + text_output = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True) + # [max_sequence_length, batch, hidden_size] -> [batch, max_sequence_length, hidden_size] + # clone to have a contiguous tensor + prompt_embeds = text_output.hidden_states[-1].detach() + prompt_embeds = prompt_embeds[:, prefix_len:-suffix_len, :] + return prompt_embeds + + def encode_prompt( + self, + prompt: Union[str, List[str]] = None, + num_images_per_prompt: Optional[int] = 1, + prompt_embeds: Optional[torch.Tensor] = None, + ): + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + # If prompt_embeds is provided and prompt is None, skip encoding + if prompt_embeds is None: + prompt_embeds = self._encode_prompt(prompt) + + _, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + text_ids = prepare_pos_ids(modality_id=0, type="text", start=(0, 0), num_token=prompt_embeds.shape[1]).to( + self.device + ) + return prompt_embeds.to(self.device), text_ids + + @staticmethod + def _pack_latents(latents, batch_size, num_channels_latents, height, width): + latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) + latents = latents.permute(0, 2, 4, 1, 3, 5) + latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4) + + return latents + + @staticmethod + def _unpack_latents(latents, height, width, vae_scale_factor): + batch_size, num_patches, channels = latents.shape + + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. 
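+        # For example, assuming the default vae_scale_factor of 8: a 1024x1024 image corresponds to a 128x128
+        # latent with 16 channels, which is packed into (128 // 2) * (128 // 2) = 4096 tokens of 16 * 4 = 64
+        # features; this method reverses that packing into a (batch, 16, 128, 128) latent before VAE decoding.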
+ height = 2 * (int(height) // (vae_scale_factor * 2)) + width = 2 * (int(width) // (vae_scale_factor * 2)) + + latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2) + latents = latents.permute(0, 3, 1, 4, 2, 5) + + latents = latents.reshape(batch_size, channels // (2 * 2), height, width) + + return latents + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + ): + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. + height = 2 * (int(height) // (self.vae_scale_factor * 2)) + width = 2 * (int(width) // (self.vae_scale_factor * 2)) + + shape = (batch_size, num_channels_latents, height, width) + latent_image_ids = prepare_pos_ids( + modality_id=1, + type="image", + start=(self.tokenizer_max_length, self.tokenizer_max_length), + height=height // 2, + width=width // 2, + ).to(device) + + if latents is not None: + return latents.to(device=device, dtype=dtype), latent_image_ids + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + latents = randn_tensor(shape, generator=generator, device=device) + latents = latents.to(dtype=dtype) + latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width) + + return latents, latent_image_ids + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def joint_attention_kwargs(self): + return self._joint_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def current_timestep(self): + return self._current_timestep + + @property + def interrupt(self): + return self._interrupt + + def check_inputs( + self, prompt, height, width, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None + ): + if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0: + logger.warning( + f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + @replace_example_docstring(EXAMPLE_DOC_STRING) + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + negative_prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + sigmas: Optional[List[float]] = None, + guidance_scale: float = 4.5, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + enable_cfg_renorm: Optional[bool] = True, + cfg_renorm_min: Optional[float] = 0.0, + enable_prompt_rewrite: Optional[bool] = True, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + enable_cfg_renorm: Whether to enable cfg_renorm. Enabling cfg_renorm will improve image quality, + but it may lead to a decrease in the stability of some image outputs.. + cfg_renorm_min: The minimum value of the cfg_renorm_scale range (0-1). + cfg_renorm_min = 1.0, renorm has no effect, while cfg_renorm_min=0.0, the renorm range is larger. + enable_prompt_rewrite: whether to enable prompt rewrite. + Examples: + + Returns: + [`~pipelines.LongCatImagePipelineOutput`] or `tuple`: [`~pipelines.LongCatImagePipelineOutput`] if + `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the + generated images. + """ + + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + self._guidance_scale = guidance_scale + self._joint_attention_kwargs = joint_attention_kwargs + self._current_timestep = None + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + if enable_prompt_rewrite: + prompt = self.rewire_prompt(prompt, device) + logger.info(f"Rewrite prompt {prompt}!") + + negative_prompt = "" if negative_prompt is None else negative_prompt + (prompt_embeds, text_ids) = self.encode_prompt( + prompt=prompt, prompt_embeds=prompt_embeds, num_images_per_prompt=num_images_per_prompt + ) + if self.do_classifier_free_guidance: + (negative_prompt_embeds, negative_text_ids) = self.encode_prompt( + prompt=negative_prompt, + prompt_embeds=negative_prompt_embeds, + num_images_per_prompt=num_images_per_prompt, + ) + + # 4. Prepare latent variables + num_channels_latents = 16 + latents, latent_image_ids = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 5. 
Prepare timesteps + sigmas = np.linspace(1.0, 1.0 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas + image_seq_len = latents.shape[1] + mu = calculate_shift( + image_seq_len, + self.scheduler.config.get("base_image_seq_len", 256), + self.scheduler.config.get("max_image_seq_len", 4096), + self.scheduler.config.get("base_shift", 0.5), + self.scheduler.config.get("max_shift", 1.15), + ) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + sigmas=sigmas, + mu=mu, + ) + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + self._num_timesteps = len(timesteps) + + # handle guidance + guidance = None + + if self.joint_attention_kwargs is None: + self._joint_attention_kwargs = {} + + # 6. Denoising loop + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t + timestep = t.expand(latents.shape[0]).to(latents.dtype) + with self.transformer.cache_context("cond"): + noise_pred_text = self.transformer( + hidden_states=latents, + timestep=timestep / 1000, + guidance=guidance, + encoder_hidden_states=prompt_embeds, + txt_ids=text_ids, + img_ids=latent_image_ids, + return_dict=False, + )[0] + if self.do_classifier_free_guidance: + with self.transformer.cache_context("uncond"): + noise_pred_uncond = self.transformer( + hidden_states=latents, + timestep=timestep / 1000, + encoder_hidden_states=negative_prompt_embeds, + txt_ids=negative_text_ids, + img_ids=latent_image_ids, + return_dict=False, + )[0] + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if enable_cfg_renorm: + cond_norm = torch.norm(noise_pred_text, dim=-1, keepdim=True) + noise_norm = torch.norm(noise_pred, dim=-1, keepdim=True) + scale = (cond_norm / (noise_norm + 1e-8)).clamp(min=cfg_renorm_min, max=1.0) + noise_pred = noise_pred * scale + else: + noise_pred = noise_pred_text + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + if latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + # some platforms (eg. 
apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + latents = latents.to(latents_dtype) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if XLA_AVAILABLE: + xm.mark_step() + + self._current_timestep = None + + if output_type == "latent": + image = latents + else: + latents = self._unpack_latents(latents, height, width, self.vae_scale_factor) + latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor + + if latents.dtype != self.vae.dtype: + latents = latents.to(dtype=self.vae.dtype) + + image = self.vae.decode(latents, return_dict=False)[0] + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return LongCatImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py b/src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py new file mode 100644 index 0000000000..988aef2125 --- /dev/null +++ b/src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py @@ -0,0 +1,726 @@ +# Copyright 2025 MeiTuan LongCat-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect +import math +import re +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor + +from ...image_processor import VaeImageProcessor +from ...loaders import FromSingleFileMixin +from ...models.autoencoders import AutoencoderKL +from ...models.transformers import LongCatImageTransformer2DModel +from ...pipelines.pipeline_utils import DiffusionPipeline +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import is_torch_xla_available, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor +from .pipeline_output import LongCatImagePipelineOutput + + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from PIL import Image + >>> import torch + >>> from diffusers import LongCatImageEditPipeline + + >>> pipe = LongCatImageEditPipeline.from_pretrained( + ... "meituan-longcat/LongCat-Image-Edit", torch_dtype=torch.bfloat16 + ... ) + >>> pipe.to("cuda") + + >>> prompt = "change the cat to dog." + >>> input_image = Image.open("test.jpg").convert("RGB") + >>> image = pipe( + ... input_image, + ... prompt, + ... num_inference_steps=50, + ... guidance_scale=4.5, + ... generator=torch.Generator("cpu").manual_seed(43), + ... 
).images[0] + >>> image.save("longcat_image_edit.png") + ``` +""" + + +# Copied from diffusers.pipelines.longcat_image.pipeline_longcat_image.split_quotation +def split_quotation(prompt, quote_pairs=None): + """ + Implement a regex-based string splitting algorithm that identifies delimiters defined by single or double quote + pairs. Examples:: + >>> prompt_en = "Please write 'Hello' on the blackboard for me." >>> print(split_quotation(prompt_en)) >>> # + output: [('Please write ', False), ("'Hello'", True), (' on the blackboard for me.', False)] + """ + word_internal_quote_pattern = re.compile(r"[a-zA-Z]+'[a-zA-Z]+") + matches_word_internal_quote_pattern = word_internal_quote_pattern.findall(prompt) + mapping_word_internal_quote = [] + + for i, word_src in enumerate(set(matches_word_internal_quote_pattern)): + word_tgt = "longcat_$##$_longcat" * (i + 1) + prompt = prompt.replace(word_src, word_tgt) + mapping_word_internal_quote.append([word_src, word_tgt]) + + if quote_pairs is None: + quote_pairs = [("'", "'"), ('"', '"'), ("‘", "’"), ("“", "”")] + pattern = "|".join([re.escape(q1) + r"[^" + re.escape(q1 + q2) + r"]*?" + re.escape(q2) for q1, q2 in quote_pairs]) + parts = re.split(f"({pattern})", prompt) + + result = [] + for part in parts: + for word_src, word_tgt in mapping_word_internal_quote: + part = part.replace(word_tgt, word_src) + if re.match(pattern, part): + if len(part): + result.append((part, True)) + else: + if len(part): + result.append((part, False)) + return result + + +# Copied from diffusers.pipelines.longcat_image.pipeline_longcat_image.prepare_pos_ids +def prepare_pos_ids(modality_id=0, type="text", start=(0, 0), num_token=None, height=None, width=None): + if type == "text": + assert num_token + if height or width: + print('Warning: The parameters of height and width will be ignored in "text" type.') + pos_ids = torch.zeros(num_token, 3) + pos_ids[..., 0] = modality_id + pos_ids[..., 1] = torch.arange(num_token) + start[0] + pos_ids[..., 2] = torch.arange(num_token) + start[1] + elif type == "image": + assert height and width + if num_token: + print('Warning: The parameter of num_token will be ignored in "image" type.') + pos_ids = torch.zeros(height, width, 3) + pos_ids[..., 0] = modality_id + pos_ids[..., 1] = pos_ids[..., 1] + torch.arange(height)[:, None] + start[0] + pos_ids[..., 2] = pos_ids[..., 2] + torch.arange(width)[None, :] + start[1] + pos_ids = pos_ids.reshape(height * width, 3) + else: + raise KeyError(f'Unknow type {type}, only support "text" or "image".') + return pos_ids + + +# Copied from diffusers.pipelines.longcat_image.pipeline_longcat_image.calculate_shift +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.15, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. 
+ num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +def calculate_dimensions(target_area, ratio): + width = math.sqrt(target_area * ratio) + height = width / ratio + + width = width if width % 16 == 0 else (width // 16 + 1) * 16 + height = height if height % 16 == 0 else (height // 16 + 1) * 16 + + width = int(width) + height = int(height) + + return width, height + + +class LongCatImageEditPipeline(DiffusionPipeline, FromSingleFileMixin): + r""" + The LongCat-Image-Edit pipeline for image editing. 
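+
+    The reference image is VAE-encoded, packed into latent patches, and concatenated to the noisy latents along the
+    sequence dimension (with its own modality id in the positional ids); only the first `image_seq_len` tokens of
+    the transformer prediction are used for denoising.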
+ """ + + model_cpu_offload_seq = "text_encoder->image_encoder->transformer->vae" + _optional_components = [] + _callback_tensor_inputs = ["latents", "prompt_embeds"] + + def __init__( + self, + scheduler: FlowMatchEulerDiscreteScheduler, + vae: AutoencoderKL, + text_encoder: Qwen2_5_VLForConditionalGeneration, + tokenizer: Qwen2Tokenizer, + text_processor: Qwen2VLProcessor, + transformer: LongCatImageTransformer2DModel, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + transformer=transformer, + scheduler=scheduler, + text_processor=text_processor, + ) + + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8 + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) + self.image_processor_vl = text_processor.image_processor + + self.image_token = "<|image_pad|>" + self.prompt_template_encode_prefix = "<|im_start|>system\nAs an image editing expert, first analyze the content and attributes of the input image(s). Then, based on the user's editing instructions, clearly and precisely determine how to modify the given image(s), ensuring that only the specified parts are altered and all other aspects remain consistent with the original(s).<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>" + self.prompt_template_encode_suffix = "<|im_end|>\n<|im_start|>assistant\n" + self.default_sample_size = 128 + self.tokenizer_max_length = 512 + + def _encode_prompt(self, prompt, image): + raw_vl_input = self.image_processor_vl(images=image, return_tensors="pt") + pixel_values = raw_vl_input["pixel_values"] + image_grid_thw = raw_vl_input["image_grid_thw"] + all_tokens = [] + for clean_prompt_sub, matched in split_quotation(prompt[0]): + if matched: + for sub_word in clean_prompt_sub: + tokens = self.tokenizer(sub_word, add_special_tokens=False)["input_ids"] + all_tokens.extend(tokens) + else: + tokens = self.tokenizer(clean_prompt_sub, add_special_tokens=False)["input_ids"] + all_tokens.extend(tokens) + + if len(all_tokens) > self.tokenizer_max_length: + logger.warning( + "Your input was truncated because `max_sequence_length` is set to " + f" {self.tokenizer_max_length} input token nums : {len(len(all_tokens))}" + ) + all_tokens = all_tokens[: self.tokenizer_max_length] + + text_tokens_and_mask = self.tokenizer.pad( + {"input_ids": [all_tokens]}, + max_length=self.tokenizer_max_length, + padding="max_length", + return_attention_mask=True, + return_tensors="pt", + ) + + text = self.prompt_template_encode_prefix + + merge_length = self.image_processor_vl.merge_size**2 + while self.image_token in text: + num_image_tokens = image_grid_thw.prod() // merge_length + text = text.replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1) + text = text.replace("<|placeholder|>", self.image_token) + + prefix_tokens = self.tokenizer(text, add_special_tokens=False)["input_ids"] + suffix_tokens = self.tokenizer(self.prompt_template_encode_suffix, add_special_tokens=False)["input_ids"] + prefix_len = len(prefix_tokens) + suffix_len = len(suffix_tokens) + + prefix_tokens_mask = torch.tensor([1] * len(prefix_tokens), dtype=text_tokens_and_mask.attention_mask[0].dtype) + suffix_tokens_mask = torch.tensor([1] * len(suffix_tokens), dtype=text_tokens_and_mask.attention_mask[0].dtype) + + prefix_tokens = torch.tensor(prefix_tokens, dtype=text_tokens_and_mask.input_ids.dtype) + suffix_tokens = torch.tensor(suffix_tokens, 
dtype=text_tokens_and_mask.input_ids.dtype) + + input_ids = torch.cat((prefix_tokens, text_tokens_and_mask.input_ids[0], suffix_tokens), dim=-1) + attention_mask = torch.cat( + (prefix_tokens_mask, text_tokens_and_mask.attention_mask[0], suffix_tokens_mask), dim=-1 + ) + + input_ids = input_ids.unsqueeze(0).to(self.device) + attention_mask = attention_mask.unsqueeze(0).to(self.device) + + pixel_values = pixel_values.to(self.device) + image_grid_thw = image_grid_thw.to(self.device) + + text_output = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + pixel_values=pixel_values, + image_grid_thw=image_grid_thw, + output_hidden_states=True, + ) + # [max_sequence_length, batch, hidden_size] -> [batch, max_sequence_length, hidden_size] + # clone to have a contiguous tensor + prompt_embeds = text_output.hidden_states[-1].detach() + prompt_embeds = prompt_embeds[:, prefix_len:-suffix_len, :] + return prompt_embeds + + def encode_prompt( + self, + prompt: List[str] = None, + image: Optional[torch.Tensor] = None, + num_images_per_prompt: Optional[int] = 1, + prompt_embeds: Optional[torch.Tensor] = None, + ): + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + # If prompt_embeds is provided and prompt is None, skip encoding + if prompt_embeds is None: + prompt_embeds = self._encode_prompt(prompt, image) + + _, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + text_ids = prepare_pos_ids(modality_id=0, type="text", start=(0, 0), num_token=prompt_embeds.shape[1]).to( + self.device + ) + return prompt_embeds, text_ids + + @staticmethod + def _pack_latents(latents, batch_size, num_channels_latents, height, width): + latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) + latents = latents.permute(0, 2, 4, 1, 3, 5) + latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4) + + return latents + + @staticmethod + def _unpack_latents(latents, height, width, vae_scale_factor): + batch_size, num_patches, channels = latents.shape + + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. 
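+        # Inverse of `_pack_latents`: each token of num_channels_latents * 4 features is unfolded back into a
+        # 2x2 spatial patch, recovering a (batch, num_channels_latents, latent_height, latent_width) tensor.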
+ height = 2 * (int(height) // (vae_scale_factor * 2)) + width = 2 * (int(width) // (vae_scale_factor * 2)) + + latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2) + latents = latents.permute(0, 3, 1, 4, 2, 5) + + latents = latents.reshape(batch_size, channels // (2 * 2), height, width) + + return latents + + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i], sample_mode="argmax") + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(self.vae.encode(image), generator=generator, sample_mode="argmax") + image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor + + return image_latents + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + def prepare_latents( + self, + image, + batch_size, + num_channels_latents, + height, + width, + dtype, + prompt_embeds_length, + device, + generator, + latents=None, + ): + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. + height = 2 * (int(height) // (self.vae_scale_factor * 2)) + width = 2 * (int(width) // (self.vae_scale_factor * 2)) + + image_latents, image_latents_ids = None, None + + if image is not None: + image = image.to(device=self.device, dtype=dtype) + + if image.shape[1] != self.vae.config.latent_channels: + image_latents = self._encode_vae_image(image=image, generator=generator) + else: + image_latents = image + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." + ) + else: + image_latents = torch.cat([image_latents], dim=0) + + image_latents = self._pack_latents(image_latents, batch_size, num_channels_latents, height, width) + + image_latents_ids = prepare_pos_ids( + modality_id=2, + type="image", + start=(prompt_embeds_length, prompt_embeds_length), + height=height // 2, + width=width // 2, + ).to(device, dtype=torch.float64) + + shape = (batch_size, num_channels_latents, height, width) + latents_ids = prepare_pos_ids( + modality_id=1, + type="image", + start=(prompt_embeds_length, prompt_embeds_length), + height=height // 2, + width=width // 2, + ).to(device) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width) + else: + latents = latents.to(device=device, dtype=dtype) + + return latents, image_latents, latents_ids, image_latents_ids + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def joint_attention_kwargs(self): + return self._joint_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def current_timestep(self): + return self._current_timestep + + @property + def interrupt(self): + return self._interrupt + + def check_inputs( + self, prompt, height, width, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None + ): + if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0: + logger.warning( + f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None: + if isinstance(prompt, str): + pass + elif isinstance(prompt, list) and len(prompt) == 1: + pass + else: + raise ValueError( + f"`prompt` must be a `str` or a `list` of length 1, but is {prompt} (type: {type(prompt)})" + ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + @replace_example_docstring(EXAMPLE_DOC_STRING) + @torch.no_grad() + def __call__( + self, + image: Optional[PIL.Image.Image] = None, + prompt: Union[str, List[str]] = None, + negative_prompt: Union[str, List[str]] = None, + num_inference_steps: int = 50, + sigmas: Optional[List[float]] = None, + guidance_scale: float = 4.5, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Examples: + + Returns: + [`~pipelines.LongCatImagePipelineOutput`] or `tuple`: [`~pipelines.LongCatImagePipelineOutput`] if + `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the + generated images. + """ + + image_size = image[0].size if isinstance(image, list) else image.size + calculated_width, calculated_height = calculate_dimensions(1024 * 1024, image_size[0] * 1.0 / image_size[1]) + + # 1. Check inputs. 
Raise error if not correct + self.check_inputs( + prompt, + calculated_height, + calculated_width, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + self._guidance_scale = guidance_scale + self._joint_attention_kwargs = joint_attention_kwargs + self._current_timestep = None + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Preprocess image + if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels): + image = self.image_processor.resize(image, calculated_height, calculated_width) + prompt_image = self.image_processor.resize(image, calculated_height // 2, calculated_width // 2) + image = self.image_processor.preprocess(image, calculated_height, calculated_width) + + negative_prompt = "" if negative_prompt is None else negative_prompt + (prompt_embeds, text_ids) = self.encode_prompt( + prompt=prompt, image=prompt_image, prompt_embeds=prompt_embeds, num_images_per_prompt=num_images_per_prompt + ) + if self.do_classifier_free_guidance: + (negative_prompt_embeds, negative_text_ids) = self.encode_prompt( + prompt=negative_prompt, + image=prompt_image, + prompt_embeds=negative_prompt_embeds, + num_images_per_prompt=num_images_per_prompt, + ) + + # 4. Prepare latent variables + num_channels_latents = 16 + latents, image_latents, latents_ids, image_latents_ids = self.prepare_latents( + image, + batch_size * num_images_per_prompt, + num_channels_latents, + calculated_height, + calculated_width, + prompt_embeds.dtype, + prompt_embeds.shape[1], + device, + generator, + latents, + ) + + # 5. Prepare timesteps + sigmas = np.linspace(1.0, 1.0 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas + image_seq_len = latents.shape[1] + mu = calculate_shift( + image_seq_len, + self.scheduler.config.get("base_image_seq_len", 256), + self.scheduler.config.get("max_image_seq_len", 4096), + self.scheduler.config.get("base_shift", 0.5), + self.scheduler.config.get("max_shift", 1.15), + ) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + sigmas=sigmas, + mu=mu, + ) + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + self._num_timesteps = len(timesteps) + + # handle guidance + guidance = None + + if self.joint_attention_kwargs is None: + self._joint_attention_kwargs = {} + + if image is not None: + latent_image_ids = torch.cat([latents_ids, image_latents_ids], dim=0) + else: + latent_image_ids = latents_ids + + # 6. 
Denoising loop + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t + + latent_model_input = latents + if image_latents is not None: + latent_model_input = torch.cat([latents, image_latents], dim=1) + + # latent_model_input = torch.cat([latent_model_input] * 2) if self.do_classifier_free_guidance else latent_model_input + timestep = t.expand(latent_model_input.shape[0]).to(latents.dtype) + with self.transformer.cache_context("cond"): + noise_pred_text = self.transformer( + hidden_states=latent_model_input, + timestep=timestep / 1000, + guidance=guidance, + encoder_hidden_states=prompt_embeds, + txt_ids=text_ids, + img_ids=latent_image_ids, + return_dict=False, + )[0] + noise_pred_text = noise_pred_text[:, :image_seq_len] + if self.do_classifier_free_guidance: + with self.transformer.cache_context("uncond"): + noise_pred_uncond = self.transformer( + hidden_states=latent_model_input, + timestep=timestep / 1000, + encoder_hidden_states=negative_prompt_embeds, + txt_ids=negative_text_ids, + img_ids=latent_image_ids, + return_dict=False, + )[0] + noise_pred_uncond = noise_pred_uncond[:, :image_seq_len] + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + else: + noise_pred = noise_pred_text + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + if latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + latents = latents.to(latents_dtype) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if XLA_AVAILABLE: + xm.mark_step() + + self._current_timestep = None + + if output_type == "latent": + image = latents + else: + latents = self._unpack_latents(latents, calculated_height, calculated_width, self.vae_scale_factor) + latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor + + if latents.dtype != self.vae.dtype: + latents = latents.to(dtype=self.vae.dtype) + + image = self.vae.decode(latents, return_dict=False)[0] + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return LongCatImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/longcat_image/pipeline_output.py b/src/diffusers/pipelines/longcat_image/pipeline_output.py new file mode 100644 index 0000000000..e3c25f1cbf --- /dev/null +++ b/src/diffusers/pipelines/longcat_image/pipeline_output.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass +from typing import List, Union + +import numpy as np +import PIL.Image + +from diffusers.utils import BaseOutput + + +@dataclass +class LongCatImagePipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. 
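+
+    Returned by both [`LongCatImagePipeline`] and [`LongCatImageEditPipeline`].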
+ """ + + images: Union[List[PIL.Image.Image], np.ndarray] diff --git a/src/diffusers/pipelines/longcat_image/system_messages.py b/src/diffusers/pipelines/longcat_image/system_messages.py new file mode 100644 index 0000000000..b8b2318e4e --- /dev/null +++ b/src/diffusers/pipelines/longcat_image/system_messages.py @@ -0,0 +1,142 @@ +SYSTEM_PROMPT_EN = """ +You are a prompt engineering expert for text-to-image models. Since text-to-image models have limited capabilities in +understanding user prompts, you need to identify the core theme and intent of the user's input and improve the model's +understanding accuracy and generation quality through optimization and rewriting. The rewrite must strictly retain all +information from the user's original prompt without deleting or distorting any details. Specific requirements are as +follows: +1. The rewrite must not affect any information expressed in the user's original prompt; the rewritten prompt should use + coherent natural language, avoid low-information redundant descriptions, and keep the rewritten prompt length as + concise as possible. +2. Ensure consistency between input and output languages: Chinese input yields Chinese output, and English input yields + English output. The rewritten token count should not exceed 512. +3. The rewritten description should further refine subject characteristics and aesthetic techniques appearing in the + original prompt, such as lighting and textures. +4. If the original prompt does not specify an image style, ensure the rewritten prompt uses a **realistic photography + style**. If the user specifies a style, retain the user's style. +5. When the original prompt requires reasoning to clarify user intent, use logical reasoning based on world knowledge + to convert vague abstract descriptions into specific tangible objects (e.g., convert "the tallest animal" to "a + giraffe"). +6. When the original prompt requires text generation, please use double quotes to enclose the text part (e.g., `"50% + OFF"`). +7. When the original prompt requires generating text-heavy scenes like webpages, logos, UIs, or posters, and no + specific text content is specified, you need to infer appropriate text content and enclose it in double quotes. For + example, if the user inputs: "A tourism flyer with a grassland theme," it should be rewritten as: "A tourism flyer + with the image title 'Grassland'." +8. When negative words exist in the original prompt, ensure the rewritten prompt does not contain negative words. For + example, "a lakeside without boats" should be rewritten such that the word "boat" does not appear at all. +9. Except for text content explicitly requested by the user, **adding any extra text content is prohibited**. +Here are examples of rewrites for different types of prompts: # Examples (Few-Shot Learning) + 1. User Input: An animal with nine lives. + Rewrite Output: A cat bathed in soft sunlight, its fur soft and glossy. The background is a comfortable home + environment with light from the window filtering through curtains, creating a warm light and shadow effect. The + shot uses a medium distance perspective to highlight the cat's leisurely and stretched posture. Light cleverly hits + the cat's face, emphasizing its spirited eyes and delicate whiskers, adding depth and affinity to the image. + 2. User Input: Create an anime-style tourism flyer with a grassland theme. + Rewrite Output: In the lower right of the center, a short-haired girl sits sideways on a gray, irregularly shaped + rock. 
She wears a white short-sleeved dress and brown flat shoes, holding a bunch of small white flowers in her + left hand, smiling with her legs hanging naturally. The girl has dark brown shoulder-length hair with bangs + covering her forehead, brown eyes, and a slightly open mouth. The rock surface has textures of varying depths. To + the girl's left and front is lush grass, with long, yellow-green blades, some glowing golden in the sunlight. The + grass extends into the distance, forming rolling green hills that fade in color as they recede. The sky occupies + the upper half of the picture, pale blue dotted with a few fluffy white clouds. In the upper left corner, there is + a line of text in italic, dark green font reading "Explore Nature's Peace". Colors are dominated by green, blue, + and yellow, fluid lines, and distinct light and shadow contrast, creating a quiet and comfortable atmosphere. + 3. User Input: A Christmas sale poster with a red background, promoting a Buy 1 Get 1 Free milk tea offer. + Rewrite Output: The poster features an overall red tone, embellished with white snowflake patterns on the top and + left side. The upper right features a bunch of holly leaves with red berries and a pine cone. In the upper center, + golden 3D text reads "Christmas Heartwarming Feedback" centered, along with red bold text "Buy 1 Get 1". Below, two + transparent cups filled with bubble tea are placed side by side; the tea is light brown with dark brown pearls + scattered at the bottom and middle. Below the cups, white snow piles up, decorated with pine branches, red berries, + and pine cones. A blurry Christmas tree is faintly visible in the lower right corner. The image has high clarity, + accurate text content, a unified design style, a prominent Christmas theme, and a reasonable layout, providing + strong visual appeal. + 4. User Input: A woman indoors shot in natural light, smiling with arms crossed, showing a relaxed and confident + posture. + Rewrite Output: The image features a young Asian woman with long dark brown hair naturally falling over her + shoulders, with some strands illuminated by light, showing a soft sheen. Her features are delicate, with long + eyebrows, bright and spirited dark brown eyes looking directly at the camera, revealing peace and confidence. She + has a high nose bridge, full lips with nude lipstick, and corners of the mouth slightly raised in a faint smile. + Her skin is fair, with cheeks and collarbones illuminated by warm light, showing a healthy ruddiness. She wears a + black spaghetti strap tank top revealing graceful collarbone lines, and a thin gold necklace with small beads and + metal bars glinting in the light. Her outer layer is a beige knitted cardigan, soft in texture with visible + knitting patterns on the sleeves. Her arms are crossed over her chest, hands covered by the cardigan sleeves, in a + relaxed posture. The background is a pure dark brown without extra decoration, making the figure the absolute + focus. The figure is located in the center of the frame. Light enters from the upper right, creating bright spots + on her left cheek, neck, and collarbone, while the right side is slightly shadowed, creating a three-dimensional + and soft tone. Image details are clear, showcasing skin texture, hair, and clothing materials well. Colors are + dominated by warm tones, with the combination of beige and dark brown creating a warm and comfortable atmosphere. + The overall style is natural, elegant, and artistic. + 5. 
User Input: Create a series of images showing the growth process of an apple from seed to fruit. The series should + include four stages: 1. Sowing, 2. Seedling growth, 3. Plant maturity, 4. Fruit harvesting. + Rewrite Output: A 4-panel exquisite illustration depicting the growth process of an apple, capturing each stage + precisely and clearly. 1. "Sowing": A close-up shot of a hand gently placing a small apple seed into fertile dark + soil, with visible soil texture and the seed's smooth surface. The background is a soft-focus garden dotted with + green leaves and sunlight filtering through. 2. "Seedling Growth": A young apple sapling breaks through the soil, + stretching tender green leaves toward the sky. The scene is set in a vibrant garden illuminated by warm golden + light, highlighting the seedling's delicate structure. 3. "Plant Maturity": A mature apple tree, lush with branches + and leaves, covered in tender green foliage and developing small apples. The background is a vibrant orchard under + a clear blue sky, with dappled sunlight creating a peaceful atmosphere. 4. "Fruit Harvesting": A hand reaches into + the tree to pick a ripe red apple, its smooth skin glistening in the sun. The scene shows the abundance of the + orchard, with baskets of apples in the background, giving a sense of fulfillment. Each illustration uses a + realistic style, focusing on details and harmonious colors to showcase the natural beauty and development of the + apple's life cycle. + 6. User Input: If 1 represents red, 2 represents green, 3 represents purple, and 4 represents yellow, please generate + a four-color rainbow based on this rule. The color order from top to bottom is 3142. + Rewrite Output: The image consists of four horizontally arranged colored stripes, ordered from top to bottom as + purple, red, yellow, and green. A white number is centered on each stripe. The top purple stripe features the + number "3", the red stripe below it has the number "1", the yellow stripe further down has the number "4", and the + bottom green stripe has the number "2". All numbers use a sans-serif font in pure white, forming a sharp contrast + with the background colors to ensure good readability. The stripes have high color saturation and a slight texture. + The overall layout is simple and clear, with distinct visual effects and no extra decorative elements, emphasizing + the numerical information. The image is high definition, with accurate colors and a consistent style, offering + strong visual appeal. + 7. User Input: A stone tablet carved with "Guan Guan Ju Jiu, On the River Isle", natural light, background is a + Chinese garden. + Rewrite Output: An ancient stone tablet carved with "Guan Guan Ju Jiu, On the River Isle", the surface covered with + traces of time, the writing clear and deep. Natural light falls from above, softly illuminating every detail of the + stone tablet and enhancing its sense of history. The background is an elegant Chinese garden featuring lush bamboo + forests, winding paths, and quiet pools, creating a serene and distant atmosphere. The overall picture uses a + realistic style with rich details and natural light and shadow effects, highlighting the cultural heritage of the + stone tablet and the classical beauty of the garden. +# Output Format Please directly output the rewritten and optimized Prompt content. 
Do not include any explanatory +language or JSON formatting, and do not add opening or closing quotes yourself.""" + + +SYSTEM_PROMPT_ZH = """ +你是一名文生图模型的prompt +engineering专家。由于文生图模型对用户prompt的理解能力有限,你需要识别用户输入的核心主题和意图,并通过优化改写提升模型的理解准确性和生成质量。改写必须严格保留用户原始prompt的所有信息,不得删减或曲解任何细节。 +具体要求如下: +1. 改写不能影响用户原始prompt里表达的任何信息,改写后的prompt应该使用连贯的自然语言表达,不要出现低信息量的冗余描述,尽可能保持改写后prompt长度精简。 +2. 请确保输入和输出的语言类型一致,中文输入中文输出,英文输入英文输出,改写后的token数量不要超过512个; +3. 改写后的描述应当进一步完善原始prompt中出现的主体特征、美学技巧,如打光、纹理等; +4. 如果原始prompt没有指定图片风格时,确保改写后的prompt使用真实摄影风格,如果用户指定了图片风格,则保留用户风格; +5. 当原始prompt需要推理才能明确用户意图时,根据世界知识进行适当逻辑推理,将模糊抽象描述转化为具体指向事物(例:将"最高的动物"转化为"一头长颈鹿")。 +6. 当原始prompt需要生成文字时,请使用双引号圈定文字部分,例:`"限时5折"`)。 +7. 当原始prompt需要生成网页、logo、ui、海报等文字场景时,且没有指定具体的文字内容时,需要推断出合适的文字内容,并使用双引号圈定,如用户输入:一个旅游宣传单,以草原为主题。应该改写成:一个旅游宣传单,图片标题为“草原”。 +8. 当原始prompt中存在否定词时,需要确保改写后的prompt不存在否定词,如没有船的湖边,改写后的prompt不能出现船这个词汇。 +9. 除非用户指定生成品牌logo,否则不要增加额外的品牌logo. +10. 除了用户明确要求书写的文字内容外,**禁止增加任何额外的文字内容**。 +以下是针对不同类型prompt改写的示例: + +# Examples (Few-Shot Learning) + 1. 用户输入: 九条命的动物。 + 改写输出: + 一只猫,被柔和的阳光笼罩着,毛发柔软而富有光泽。背景是一个舒适的家居环境,窗外的光线透过窗帘,形成温馨的光影效果。镜头采用中距离视角,突出猫悠闲舒展的姿态。光线巧妙地打在猫的脸部,强调它灵动的眼睛和精致的胡须,增加画面的层次感与亲和力。 + 2. 用户输入: 制作一个动画风格的旅游宣传单,以草原为主题。 + 改写输出: + 画面中央偏右下角,一个短发女孩侧身坐在灰色的不规则形状岩石上,她穿着白色短袖连衣裙和棕色平底鞋,左手拿着一束白色小花,面带微笑,双腿自然垂下。女孩的头发为深棕色,齐肩短发,刘海覆盖额头,眼睛呈棕色,嘴巴微张。岩石表面有深浅不一的纹理。女孩的左侧和前方是茂盛的草地,草叶细长,呈黄绿色,部分草叶在阳光下泛着金色的光芒,仿佛被阳光照亮。草地向远处延伸,形成连绵起伏的绿色山丘,山丘的颜色由近及远逐渐变浅。天空占据了画面的上半部分,呈淡蓝色,点缀着几朵白色蓬松的云彩。画面的左上角有一行文字,文字内容是斜体、深绿色的“Explore + Nature's Peace”。色彩以绿色、蓝色和黄色为主,线条流畅,光影明暗对比明显,营造出一种宁静、舒适的氛围。 + 3. 用户输入: 一张以红色为背景的圣诞节促销海报,主要宣传奶茶买一送一的优惠活动。 + 改写输出: 海报整体呈现红色调,上方和左侧点缀着白色雪花图案,右上方有一束冬青叶和红色浆果,以及一个松果。海报中央偏上位置,金色立体字样“圣诞节 + 暖心回馈”居中排列,和红色粗体字“买1送1”。海报下方,两个装满珍珠奶茶的透明杯子并排摆放,杯中奶茶呈浅棕色,底部和中间散布着深棕色珍珠。杯子下方,堆积着白色雪花,雪花上装饰着松枝、红色浆果和松果。右下角隐约可见一棵模糊的圣诞树。图片清晰度高,文字内容准确,整体设计风格统一,圣诞主题突出,排版布局合理,具有较强的视觉吸引力。 + 4. 用户输入: 一位女性在室内以自然光线拍摄,她面带微笑,双臂交叉,展现出轻松自信的姿态。 + 改写输出: + 画面中是一位年轻的亚洲女性,她拥有深棕色的长发,发丝自然地垂落在双肩,部分发丝被光线照亮,呈现出柔和的光泽。她的五官精致,眉毛修长,眼睛明亮有神,瞳孔呈深棕色,眼神直视镜头,流露出平和与自信。鼻梁挺拔,嘴唇丰满,涂有裸色系唇膏,嘴角微微上扬,展现出浅浅的微笑。她的肤色白皙,脸颊和锁骨处被暖色调的光线照亮,呈现出健康的红润感。她穿着一件黑色的细吊带背心,肩带纤细,露出优美的锁骨线条。脖颈上佩戴着一条金色的细项链,项链由小珠子和几个细长的金属条组成,在光线下闪烁着光泽。她的外搭是一件米黄色的针织开衫,材质柔软,袖子部分有明显的针织纹理。她双臂交叉在胸前,双手被开衫的袖子覆盖,姿态放松。背景是纯粹的深棕色,没有多余的装饰,使得人物成为画面的绝对焦点。人物位于画面中央。光线从画面的右上方射入,在人物的左侧脸颊、脖颈和锁骨处形成明亮的光斑,右侧则略显阴影,营造出立体感和柔和的影调。图像细节清晰,人物的皮肤纹理、发丝以及衣物材质都得到了很好的展现。色彩以暖色调为主,米黄色和深棕色的搭配营造出温馨舒适的氛围。整体呈现出一种自然、优雅且富有亲和力的艺术风格。 + 5. 用户输入:创作一系列图片,展现苹果从种子到结果的生长过程。该系列图片应包含以下四个阶段:1. 播种,2. 幼苗生长,3. 植物成熟,4. 果实采摘。 + 改写输出:一个4宫格的精美插图,描绘苹果的生长过程,精确清晰地捕捉每个阶段。1.“播种”:特写镜头,一只手轻轻地将一颗小小的苹果种子放入肥沃的深色土壤中,土壤的纹理和种子光滑的表面清晰可见。背景是花园的柔焦画面,点缀着绿色的树叶和透过树叶洒下的阳光。2.“幼苗生长”:一棵幼小的苹果树苗破土而出,嫩绿的叶子向天空舒展。场景设定在一个生机勃勃的花园中,温暖的金光照亮了它。幼苗的纤细结构。3.“植物的成熟”:一棵成熟的苹果树,枝繁叶茂,挂满了嫩绿的叶子和正在萌发的小苹果。背景是一片生机勃勃的果园,湛蓝的天空下,斑驳的阳光营造出宁静祥和的氛围。4.“采摘果实”:一只手伸向树上,摘下一个成熟的红苹果,苹果光滑的果皮在阳光下闪闪发光。画面展现了果园的丰收景象,背景中摆放着一篮篮的苹果,给人一种圆满满足的感觉。每幅插图都采用写实风格,注重细节,色彩和谐,展现了苹果生命周期的自然之美和发展过程。 + 6. 用户输入: 如果1代表红色,2代表绿色,3代表紫色,4代表黄色,请按照此规则生成四色彩虹。它的颜色顺序从上到下是3142 + 改写输出:图片由四个水平排列的彩色条纹组成,从上到下依次为紫色、红色、黄色和绿色。每个条纹上都居中放置一个白色数字。最上方的紫色条纹上是数字“3”,其下方红色条纹上是数字“1”,再下方黄色条纹上是数字“4”,最下方的绿色条纹上是数字“2”。所有数字均采用无衬线字体,颜色为纯白色,与背景色形成鲜明对比,确保了良好的可读性。条纹的颜色饱和度高,且带有轻微的纹理感,整体排版简洁明了,视觉效果清晰,没有多余的装饰元素,强调了数字信息本身。图片整体清晰度高,色彩准确,风格一致,具有较强的视觉吸引力。 + 7. 
用户输入:石碑上刻着“关关雎鸠,在河之洲”,自然光照,背景是中式园林 + 改写输出:一块古老的石碑上刻着“关关雎鸠,在河之洲”,石碑表面布满岁月的痕迹,字迹清晰而深刻。自然光线从上方洒下,柔和地照亮石碑的每一个细节,增强了其历史感。背景是一座典雅的中式园林,园林中有翠绿的竹林、蜿蜒的小径和静谧的水池,营造出一种宁静而悠远的氛围。整体画面采用写实风格,细节丰富,光影效果自然,突出了石碑的文化底蕴和园林的古典美。 +# 输出格式 请直接输出改写优化后的 Prompt 内容,不要包含任何解释性语言或 JSON 格式,不要自行添加开头或结尾的引号。 +""" diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 8628893200..606fea18c7 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -1132,6 +1132,21 @@ class LatteTransformer3DModel(metaclass=DummyObject): requires_backends(cls, ["torch"]) +class LongCatImageTransformer2DModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class LTXVideoTransformer3DModel(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index ff65372f3c..be7f8f8ce4 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -1832,6 +1832,36 @@ class LEditsPPPipelineStableDiffusionXL(metaclass=DummyObject): requires_backends(cls, ["torch", "transformers"]) +class LongCatImageEditPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class LongCatImagePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class LTXConditionPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/longcat_image/__init__.py b/tests/pipelines/longcat_image/__init__.py new file mode 100644 index 0000000000..e69de29bb2 From 5e48f466b9c0d257f2650e8feec378a0022f2402 Mon Sep 17 00:00:00 2001 From: junqiangwu <32137981+junqiangwu@users.noreply.github.com> Date: Tue, 16 Dec 2025 16:02:25 +0800 Subject: [PATCH 05/11] fix the prefix_token_len bug (#12845) --- .../pipelines/longcat_image/pipeline_longcat_image_edit.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py b/src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py index 988aef2125..e55a2a47f3 100644 --- a/src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py +++ b/src/diffusers/pipelines/longcat_image/pipeline_longcat_image_edit.py @@ -306,7 +306,9 @@ class LongCatImageEditPipeline(DiffusionPipeline, FromSingleFileMixin): prefix_tokens = self.tokenizer(text, add_special_tokens=False)["input_ids"] suffix_tokens = self.tokenizer(self.prompt_template_encode_suffix, add_special_tokens=False)["input_ids"] - prefix_len = 
len(prefix_tokens) + + vision_start_token_id = self.tokenizer.convert_tokens_to_ids("<|vision_start|>") + prefix_len = prefix_tokens.index(vision_start_token_id) suffix_len = len(suffix_tokens) prefix_tokens_mask = torch.tensor([1] * len(prefix_tokens), dtype=text_tokens_and_mask.attention_mask[0].dtype) @@ -660,7 +662,6 @@ class LongCatImageEditPipeline(DiffusionPipeline, FromSingleFileMixin): if image_latents is not None: latent_model_input = torch.cat([latents, image_latents], dim=1) - # latent_model_input = torch.cat([latent_model_input] * 2) if self.do_classifier_free_guidance else latent_model_input timestep = t.expand(latent_model_input.shape[0]).to(latents.dtype) with self.transformer.cache_context("cond"): noise_pred_text = self.transformer( From 87f7d111437e1dad2a25d4653c57886f8f058cd3 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Wed, 17 Dec 2025 16:14:08 +0800 Subject: [PATCH 06/11] extend TorchAoTest::test_model_memory_usage to other platform (#12768) * extend TorchAoTest::test_model_memory_usage to other platform Signe-off-by: Wang, Yi * add some comments Signed-off-by: Wang, Yi --------- Signed-off-by: Wang, Yi --- tests/quantization/torchao/test_torchao.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/quantization/torchao/test_torchao.py b/tests/quantization/torchao/test_torchao.py index 38997de17b..e6bfc2530a 100644 --- a/tests/quantization/torchao/test_torchao.py +++ b/tests/quantization/torchao/test_torchao.py @@ -35,6 +35,7 @@ from diffusers.models.attention_processor import Attention from diffusers.quantizers import PipelineQuantizationConfig from ...testing_utils import ( + Expectations, backend_empty_cache, backend_synchronize, enable_full_determinism, @@ -497,8 +498,23 @@ class TorchAoTest(unittest.TestCase): def test_model_memory_usage(self): model_id = "hf-internal-testing/tiny-flux-pipe" - expected_memory_saving_ratio = 2.0 - + expected_memory_saving_ratios = Expectations( + { + # XPU: For this tiny model, per-tensor overheads (alignment, fragmentation, metadata) become visible. + # While XPU doesn't have the large fixed cuBLAS workspace of A100, these small overheads prevent reaching the ideal 2.0 ratio. + # Observed ~1.27x (158k vs 124k) for model size. + # The runtime memory overhead is ~88k for both bf16 and int8wo. Adding this to model size: (158k+88k)/(124k+88k) ≈ 1.15. + ("xpu", None): 1.15, + # On Ampere, the cuBLAS kernels used for matrix multiplication often allocate a fixed-size workspace. + # Since the tiny-flux model weights are likely smaller than or comparable to this workspace, the total memory is dominated by the workspace. + ("cuda", 8): 1.02, + # On Hopper, TorchAO utilizes newer, highly optimized kernels (via Triton or CUTLASS 3.x) that are designed to be workspace-free or use negligible extra memory. + # Additionally, Triton kernels often handle unaligned memory better, avoiding the padding overhead seen on other backends for tiny tensors. + # This allows it to achieve the near-ideal 2.0x compression ratio. 
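+ # In general, the measured ratio is roughly (bf16_model_bytes + overhead) / (int8_model_bytes + overhead);
+ # int8 weight-only storage is about half of bf16, so 2.0 is the ideal upper bound, reached only when
+ # the fixed runtime overhead is negligible relative to the weights.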
+ ("cuda", 9): 2.0, + } + ) + expected_memory_saving_ratio = expected_memory_saving_ratios.get_expectation() inputs = self.get_dummy_tensor_inputs(device=torch_device) transformer_bf16 = self.get_dummy_components(None, model_id=model_id)["transformer"] From f9c1e612fb85dd971beeba77c3ddc0826e2146a4 Mon Sep 17 00:00:00 2001 From: naykun Date: Wed, 17 Dec 2025 19:27:57 +0800 Subject: [PATCH 07/11] Qwen Image Layered Support (#12853) * [qwen-image] qwen image layered support * [qwen-image] update doc * [qwen-image] fix pr comments * Apply style fixes * make fix-copies --------- Co-authored-by: github-actions[bot] Co-authored-by: Sayak Paul --- src/diffusers/__init__.py | 2 + .../autoencoders/autoencoder_kl_qwenimage.py | 11 +- .../transformers/transformer_qwenimage.py | 143 ++- src/diffusers/pipelines/__init__.py | 2 + src/diffusers/pipelines/qwenimage/__init__.py | 2 + .../qwenimage/pipeline_qwenimage_layered.py | 905 ++++++++++++++++++ .../dummy_torch_and_transformers_objects.py | 15 + 7 files changed, 1070 insertions(+), 10 deletions(-) create mode 100644 src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 86f196f9be..aec7efd1ff 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -564,6 +564,7 @@ else: "QwenImageEditPlusPipeline", "QwenImageImg2ImgPipeline", "QwenImageInpaintPipeline", + "QwenImageLayeredPipeline", "QwenImagePipeline", "ReduxImageEncoder", "SanaControlNetPipeline", @@ -1272,6 +1273,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: QwenImageEditPlusPipeline, QwenImageImg2ImgPipeline, QwenImageInpaintPipeline, + QwenImageLayeredPipeline, QwenImagePipeline, ReduxImageEncoder, SanaControlNetPipeline, diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py b/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py index 618801dfb6..7f7266146e 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py @@ -394,6 +394,7 @@ class QwenImageEncoder3d(nn.Module): attn_scales=[], temperal_downsample=[True, True, False], dropout=0.0, + input_channels=3, non_linearity: str = "silu", ): super().__init__() @@ -410,7 +411,7 @@ class QwenImageEncoder3d(nn.Module): scale = 1.0 # init block - self.conv_in = QwenImageCausalConv3d(3, dims[0], 3, padding=1) + self.conv_in = QwenImageCausalConv3d(input_channels, dims[0], 3, padding=1) # downsample blocks self.down_blocks = nn.ModuleList([]) @@ -570,6 +571,7 @@ class QwenImageDecoder3d(nn.Module): attn_scales=[], temperal_upsample=[False, True, True], dropout=0.0, + input_channels=3, non_linearity: str = "silu", ): super().__init__() @@ -621,7 +623,7 @@ class QwenImageDecoder3d(nn.Module): # output blocks self.norm_out = QwenImageRMS_norm(out_dim, images=False) - self.conv_out = QwenImageCausalConv3d(out_dim, 3, 3, padding=1) + self.conv_out = QwenImageCausalConv3d(out_dim, input_channels, 3, padding=1) self.gradient_checkpointing = False @@ -684,6 +686,7 @@ class AutoencoderKLQwenImage(ModelMixin, AutoencoderMixin, ConfigMixin, FromOrig attn_scales: List[float] = [], temperal_downsample: List[bool] = [False, True, True], dropout: float = 0.0, + input_channels: int = 3, latents_mean: List[float] = [-0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508, 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921], latents_std: List[float] = [2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743, 3.2687, 2.1526, 
2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160], ) -> None: @@ -695,13 +698,13 @@ class AutoencoderKLQwenImage(ModelMixin, AutoencoderMixin, ConfigMixin, FromOrig self.temperal_upsample = temperal_downsample[::-1] self.encoder = QwenImageEncoder3d( - base_dim, z_dim * 2, dim_mult, num_res_blocks, attn_scales, self.temperal_downsample, dropout + base_dim, z_dim * 2, dim_mult, num_res_blocks, attn_scales, self.temperal_downsample, dropout, input_channels ) self.quant_conv = QwenImageCausalConv3d(z_dim * 2, z_dim * 2, 1) self.post_quant_conv = QwenImageCausalConv3d(z_dim, z_dim, 1) self.decoder = QwenImageDecoder3d( - base_dim, z_dim, dim_mult, num_res_blocks, attn_scales, self.temperal_upsample, dropout + base_dim, z_dim, dim_mult, num_res_blocks, attn_scales, self.temperal_upsample, dropout, input_channels ) self.spatial_compression_ratio = 2 ** len(self.temperal_downsample) diff --git a/src/diffusers/models/transformers/transformer_qwenimage.py b/src/diffusers/models/transformers/transformer_qwenimage.py index 3adfcdb147..1229bab169 100644 --- a/src/diffusers/models/transformers/transformer_qwenimage.py +++ b/src/diffusers/models/transformers/transformer_qwenimage.py @@ -143,17 +143,26 @@ def apply_rotary_emb_qwen( class QwenTimestepProjEmbeddings(nn.Module): - def __init__(self, embedding_dim): + def __init__(self, embedding_dim, use_additional_t_cond=False): super().__init__() self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000) self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) + self.use_additional_t_cond = use_additional_t_cond + if use_additional_t_cond: + self.addition_t_embedding = nn.Embedding(2, embedding_dim) - def forward(self, timestep, hidden_states): + def forward(self, timestep, hidden_states, addition_t_cond=None): timesteps_proj = self.time_proj(timestep) timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype)) # (N, D) conditioning = timesteps_emb + if self.use_additional_t_cond: + if addition_t_cond is None: + raise ValueError("When additional_t_cond is True, addition_t_cond must be provided.") + addition_t_emb = self.addition_t_embedding(addition_t_cond) + addition_t_emb = addition_t_emb.to(dtype=hidden_states.dtype) + conditioning = conditioning + addition_t_emb return conditioning @@ -259,6 +268,120 @@ class QwenEmbedRope(nn.Module): return freqs.clone().contiguous() +class QwenEmbedLayer3DRope(nn.Module): + def __init__(self, theta: int, axes_dim: List[int], scale_rope=False): + super().__init__() + self.theta = theta + self.axes_dim = axes_dim + pos_index = torch.arange(4096) + neg_index = torch.arange(4096).flip(0) * -1 - 1 + self.pos_freqs = torch.cat( + [ + self.rope_params(pos_index, self.axes_dim[0], self.theta), + self.rope_params(pos_index, self.axes_dim[1], self.theta), + self.rope_params(pos_index, self.axes_dim[2], self.theta), + ], + dim=1, + ) + self.neg_freqs = torch.cat( + [ + self.rope_params(neg_index, self.axes_dim[0], self.theta), + self.rope_params(neg_index, self.axes_dim[1], self.theta), + self.rope_params(neg_index, self.axes_dim[2], self.theta), + ], + dim=1, + ) + + self.scale_rope = scale_rope + + def rope_params(self, index, dim, theta=10000): + """ + Args: + index: [0, 1, 2, 3] 1D Tensor representing the position index of the token + """ + assert dim % 2 == 0 + freqs = torch.outer(index, 1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float32).div(dim))) + freqs = torch.polar(torch.ones_like(freqs), freqs) + return 
freqs + + def forward(self, video_fhw, txt_seq_lens, device): + """ + Args: video_fhw: [frame, height, width] a list of 3 integers representing the shape of the video Args: + txt_length: [bs] a list of 1 integers representing the length of the text + """ + if self.pos_freqs.device != device: + self.pos_freqs = self.pos_freqs.to(device) + self.neg_freqs = self.neg_freqs.to(device) + + if isinstance(video_fhw, list): + video_fhw = video_fhw[0] + if not isinstance(video_fhw, list): + video_fhw = [video_fhw] + + vid_freqs = [] + max_vid_index = 0 + layer_num = len(video_fhw) - 1 + for idx, fhw in enumerate(video_fhw): + frame, height, width = fhw + if idx != layer_num: + video_freq = self._compute_video_freqs(frame, height, width, idx) + else: + ### For the condition image, we set the layer index to -1 + video_freq = self._compute_condition_freqs(frame, height, width) + video_freq = video_freq.to(device) + vid_freqs.append(video_freq) + + if self.scale_rope: + max_vid_index = max(height // 2, width // 2, max_vid_index) + else: + max_vid_index = max(height, width, max_vid_index) + + max_vid_index = max(max_vid_index, layer_num) + max_len = max(txt_seq_lens) + txt_freqs = self.pos_freqs[max_vid_index : max_vid_index + max_len, ...] + vid_freqs = torch.cat(vid_freqs, dim=0) + + return vid_freqs, txt_freqs + + @functools.lru_cache(maxsize=None) + def _compute_video_freqs(self, frame, height, width, idx=0): + seq_lens = frame * height * width + freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1) + freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1) + + freqs_frame = freqs_pos[0][idx : idx + frame].view(frame, 1, 1, -1).expand(frame, height, width, -1) + if self.scale_rope: + freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0) + freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1) + freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0) + freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1) + else: + freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1) + freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1) + + freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1) + return freqs.clone().contiguous() + + @functools.lru_cache(maxsize=None) + def _compute_condition_freqs(self, frame, height, width): + seq_lens = frame * height * width + freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1) + freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1) + + freqs_frame = freqs_neg[0][-1:].view(frame, 1, 1, -1).expand(frame, height, width, -1) + if self.scale_rope: + freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0) + freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1) + freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0) + freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1) + else: + freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1) + freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1) + + freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1) + return freqs.clone().contiguous() + + class 
QwenDoubleStreamAttnProcessor2_0: """ Attention processor for Qwen double-stream architecture, matching DoubleStreamLayerMegatron logic. This processor @@ -578,14 +701,21 @@ class QwenImageTransformer2DModel( guidance_embeds: bool = False, # TODO: this should probably be removed axes_dims_rope: Tuple[int, int, int] = (16, 56, 56), zero_cond_t: bool = False, + use_additional_t_cond: bool = False, + use_layer3d_rope: bool = False, ): super().__init__() self.out_channels = out_channels or in_channels self.inner_dim = num_attention_heads * attention_head_dim - self.pos_embed = QwenEmbedRope(theta=10000, axes_dim=list(axes_dims_rope), scale_rope=True) + if not use_layer3d_rope: + self.pos_embed = QwenEmbedRope(theta=10000, axes_dim=list(axes_dims_rope), scale_rope=True) + else: + self.pos_embed = QwenEmbedLayer3DRope(theta=10000, axes_dim=list(axes_dims_rope), scale_rope=True) - self.time_text_embed = QwenTimestepProjEmbeddings(embedding_dim=self.inner_dim) + self.time_text_embed = QwenTimestepProjEmbeddings( + embedding_dim=self.inner_dim, use_additional_t_cond=use_additional_t_cond + ) self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6) @@ -621,6 +751,7 @@ class QwenImageTransformer2DModel( guidance: torch.Tensor = None, # TODO: this should probably be removed attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_block_samples=None, + additional_t_cond=None, return_dict: bool = True, ) -> Union[torch.Tensor, Transformer2DModelOutput]: """ @@ -683,9 +814,9 @@ class QwenImageTransformer2DModel( guidance = guidance.to(hidden_states.dtype) * 1000 temb = ( - self.time_text_embed(timestep, hidden_states) + self.time_text_embed(timestep, hidden_states, additional_t_cond) if guidance is None - else self.time_text_embed(timestep, guidance, hidden_states) + else self.time_text_embed(timestep, guidance, hidden_states, additional_t_cond) ) image_rotary_emb = self.pos_embed(img_shapes, txt_seq_lens, device=hidden_states.device) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index ff5cd829ce..a2a374906b 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -422,6 +422,7 @@ else: "QwenImageEditInpaintPipeline", "QwenImageControlNetInpaintPipeline", "QwenImageControlNetPipeline", + "QwenImageLayeredPipeline", ] _import_structure["chronoedit"] = ["ChronoEditPipeline"] try: @@ -764,6 +765,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: QwenImageEditPlusPipeline, QwenImageImg2ImgPipeline, QwenImageInpaintPipeline, + QwenImageLayeredPipeline, QwenImagePipeline, ) from .sana import ( diff --git a/src/diffusers/pipelines/qwenimage/__init__.py b/src/diffusers/pipelines/qwenimage/__init__.py index 2400632ba2..3f43d0ebb0 100644 --- a/src/diffusers/pipelines/qwenimage/__init__.py +++ b/src/diffusers/pipelines/qwenimage/__init__.py @@ -31,6 +31,7 @@ else: _import_structure["pipeline_qwenimage_edit_plus"] = ["QwenImageEditPlusPipeline"] _import_structure["pipeline_qwenimage_img2img"] = ["QwenImageImg2ImgPipeline"] _import_structure["pipeline_qwenimage_inpaint"] = ["QwenImageInpaintPipeline"] + _import_structure["pipeline_qwenimage_layered"] = ["QwenImageLayeredPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -47,6 +48,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: from .pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline from .pipeline_qwenimage_img2img import QwenImageImg2ImgPipeline from .pipeline_qwenimage_inpaint import QwenImageInpaintPipeline + from .pipeline_qwenimage_layered import 
QwenImageLayeredPipeline else: import sys diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py new file mode 100644 index 0000000000..7bb12c26ba --- /dev/null +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py @@ -0,0 +1,905 @@ +# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import math +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import QwenImageLoraLoaderMixin +from ...models import AutoencoderKLQwenImage, QwenImageTransformer2DModel +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import is_torch_xla_available, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import QwenImagePipelineOutput + + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from PIL import Image + >>> from diffusers import QwenImageLayeredPipeline + >>> from diffusers.utils import load_image + + >>> pipe = QwenImageLayeredPipeline.from_pretrained("Qwen/Qwen-Image-Layered", torch_dtype=torch.bfloat16) + >>> pipe.to("cuda") + >>> image = load_image( + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png" + ... ).convert("RGBA") + >>> prompt = "" + >>> # Depending on the variant being used, the pipeline call will slightly vary. + >>> # Refer to the pipeline documentation for more details. + >>> images = pipe( + ... image, + ... prompt, + ... num_inference_steps=50, + ... true_cfg_scale=4.0, + ... layers=4, + ... resolution=640, + ... cfg_normalize=False, + ... use_en_prompt=True, + ... ).images[0] + >>> for i, image in enumerate(images): + ... 
image.save(f"{i}.out.png") + ``` +""" + + +# Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.calculate_shift +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.15, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit_plus.calculate_dimensions +def calculate_dimensions(target_area, ratio): + width = math.sqrt(target_area * ratio) + height = width / ratio + + width = round(width / 32) * 32 + height = round(height / 32) * 32 + + return width, height + + +class QwenImageLayeredPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): + r""" + The Qwen-Image-Layered pipeline for image decomposing. + + Args: + transformer ([`QwenImageTransformer2DModel`]): + Conditional Transformer (MMDiT) architecture to denoise the encoded image latents. + scheduler ([`FlowMatchEulerDiscreteScheduler`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`Qwen2.5-VL-7B-Instruct`]): + [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct), specifically the + [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) variant. + tokenizer (`QwenTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer). + """ + + model_cpu_offload_seq = "text_encoder->transformer->vae" + _callback_tensor_inputs = ["latents", "prompt_embeds"] + + def __init__( + self, + scheduler: FlowMatchEulerDiscreteScheduler, + vae: AutoencoderKLQwenImage, + text_encoder: Qwen2_5_VLForConditionalGeneration, + tokenizer: Qwen2Tokenizer, + processor: Qwen2VLProcessor, + transformer: QwenImageTransformer2DModel, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + processor=processor, + transformer=transformer, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8 + self.latent_channels = self.vae.config.z_dim if getattr(self, "vae", None) else 16 + # QwenImage latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible + # by the patch size. 
So the vae scale factor is multiplied by the patch size to account for this + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) + self.vl_processor = processor + self.tokenizer_max_length = 1024 + + self.prompt_template_encode = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n" + self.prompt_template_encode_start_idx = 34 + self.image_caption_prompt_cn = """<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n# 图像标注器\n你是一个专业的图像标注器。请基于输入图像,撰写图注:\n1. +使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。\n2. 通过加入以下内容,丰富图注细节:\n - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等\n - +对象间的视觉关系:如空间关系、功能关系、动作关系、从属关系、比较关系、因果关系等\n - 环境细节:例如天气、光照、颜色、纹理、气氛等\n - 文字内容:识别图像中清晰可见的文字,不做翻译和解释,用引号在图注中强调\n3. +保持真实性与准确性:\n - 不要使用笼统的描述\n - +描述图像中所有可见的信息,但不要加入没有在图像中出现的内容\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>assistant\n""" + self.image_caption_prompt_en = """<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n# Image Annotator\nYou are a professional +image annotator. Please write an image caption based on the input image:\n1. Write the caption using natural, +descriptive language without structured formats or rich text.\n2. Enrich caption details by including: \n - Object +attributes, such as quantity, color, shape, size, material, state, position, actions, and so on\n - Vision Relations +between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action +relations, comparative relations, causal relations, and so on\n - Environmental details, such as weather, lighting, +colors, textures, atmosphere, and so on\n - Identify the text clearly visible in the image, without translation or +explanation, and highlight it in the caption with quotation marks\n3. 
Maintain authenticity and accuracy:\n - Avoid +generalizations\n - Describe all visible information in the image, while do not add information not explicitly shown in +the image\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>assistant\n""" + self.default_sample_size = 128 + + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._extract_masked_hidden + def _extract_masked_hidden(self, hidden_states: torch.Tensor, mask: torch.Tensor): + bool_mask = mask.bool() + valid_lengths = bool_mask.sum(dim=1) + selected = hidden_states[bool_mask] + split_result = torch.split(selected, valid_lengths.tolist(), dim=0) + + return split_result + + def _get_qwen_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + device = device or self._execution_device + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + + template = self.prompt_template_encode + drop_idx = self.prompt_template_encode_start_idx + txt = [template.format(e) for e in prompt] + txt_tokens = self.tokenizer( + txt, + padding=True, + return_tensors="pt", + ).to(device) + encoder_hidden_states = self.text_encoder( + input_ids=txt_tokens.input_ids, + attention_mask=txt_tokens.attention_mask, + output_hidden_states=True, + ) + hidden_states = encoder_hidden_states.hidden_states[-1] + split_hidden_states = self._extract_masked_hidden(hidden_states, txt_tokens.attention_mask) + split_hidden_states = [e[drop_idx:] for e in split_hidden_states] + attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states] + max_seq_len = max([e.size(0) for e in split_hidden_states]) + prompt_embeds = torch.stack( + [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states] + ) + encoder_attention_mask = torch.stack( + [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list] + ) + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + return prompt_embeds, encoder_attention_mask + + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_mask: Optional[torch.Tensor] = None, + max_sequence_length: int = 1024, + ): + r""" + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. 
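+ prompt_embeds_mask (`torch.Tensor`, *optional*):
+ Pre-generated attention mask for `prompt_embeds`. Has to be provided whenever `prompt_embeds` is
+ passed.
+ max_sequence_length (`int`, defaults to 1024):
+ Maximum number of text tokens to keep; the returned embeddings and mask are truncated to this
+ length.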
+ """ + device = device or self._execution_device + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, device) + + prompt_embeds = prompt_embeds[:, :max_sequence_length] + prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length] + + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) + prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + + return prompt_embeds, prompt_embeds_mask + + def get_image_caption(self, prompt_image, use_en_prompt=True, device=None): + if use_en_prompt: + prompt = self.image_caption_prompt_en + else: + prompt = self.image_caption_prompt_cn + model_inputs = self.vl_processor( + text=prompt, + images=prompt_image, + padding=True, + return_tensors="pt", + ).to(device) + generated_ids = self.text_encoder.generate(**model_inputs, max_new_tokens=512) + generated_ids_trimmed = [ + out_ids[len(in_ids) :] for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids) + ] + output_text = self.vl_processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + )[0] + return output_text.strip() + + def check_inputs( + self, + height, + width, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + prompt_embeds_mask=None, + negative_prompt_embeds_mask=None, + callback_on_step_end_tensor_inputs=None, + max_sequence_length=None, + ): + if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0: + logger.warning( + f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly" + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and prompt_embeds_mask is None: + raise ValueError( + "If `prompt_embeds` are provided, `prompt_embeds_mask` also have to be passed. Make sure to generate `prompt_embeds_mask` from the same text encoder that was used to generate `prompt_embeds`." + ) + if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also have to be passed. Make sure to generate `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`." 
+ ) + + if max_sequence_length is not None and max_sequence_length > 1024: + raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}") + + @staticmethod + def _pack_latents(latents, batch_size, num_channels_latents, height, width, layers): + latents = latents.view(batch_size, layers, num_channels_latents, height // 2, 2, width // 2, 2) + latents = latents.permute(0, 1, 3, 5, 2, 4, 6) + latents = latents.reshape(batch_size, layers * (height // 2) * (width // 2), num_channels_latents * 4) + + return latents + + @staticmethod + def _unpack_latents(latents, height, width, layers, vae_scale_factor): + batch_size, num_patches, channels = latents.shape + + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. + height = 2 * (int(height) // (vae_scale_factor * 2)) + width = 2 * (int(width) // (vae_scale_factor * 2)) + + latents = latents.view(batch_size, layers + 1, height // 2, width // 2, channels // 4, 2, 2) + latents = latents.permute(0, 1, 4, 2, 5, 3, 6) + + latents = latents.reshape(batch_size, layers + 1, channels // (2 * 2), height, width) + latents = latents.permute(0, 2, 1, 3, 4) # (b, c, f, h, w) + + return latents + + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit.QwenImageEditPipeline._encode_vae_image + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i], sample_mode="argmax") + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(self.vae.encode(image), generator=generator, sample_mode="argmax") + latents_mean = ( + torch.tensor(self.vae.config.latents_mean) + .view(1, self.latent_channels, 1, 1, 1) + .to(image_latents.device, image_latents.dtype) + ) + latents_std = ( + torch.tensor(self.vae.config.latents_std) + .view(1, self.latent_channels, 1, 1, 1) + .to(image_latents.device, image_latents.dtype) + ) + image_latents = (image_latents - latents_mean) / latents_std + + return image_latents + + def prepare_latents( + self, + image, + batch_size, + num_channels_latents, + height, + width, + layers, + dtype, + device, + generator, + latents=None, + ): + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. + height = 2 * (int(height) // (self.vae_scale_factor * 2)) + width = 2 * (int(width) // (self.vae_scale_factor * 2)) + + shape = ( + batch_size, + layers + 1, + num_channels_latents, + height, + width, + ) ### the generated first image is combined image + + image_latents = None + if image is not None: + image = image.to(device=device, dtype=dtype) + if image.shape[1] != self.latent_channels: + image_latents = self._encode_vae_image(image=image, generator=generator) + else: + image_latents = image + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." 
+ ) + else: + image_latents = torch.cat([image_latents], dim=0) + + image_latent_height, image_latent_width = image_latents.shape[3:] + image_latents = image_latents.permute(0, 2, 1, 3, 4) # (b, c, f, h, w) -> (b, f, c, h, w) + image_latents = self._pack_latents( + image_latents, batch_size, num_channels_latents, image_latent_height, image_latent_width, 1 + ) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width, layers + 1) + else: + latents = latents.to(device=device, dtype=dtype) + + return latents, image_latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def attention_kwargs(self): + return self._attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def current_timestep(self): + return self._current_timestep + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image: Optional[PipelineImageInput] = None, + prompt: Union[str, List[str]] = None, + negative_prompt: Union[str, List[str]] = None, + true_cfg_scale: float = 4.0, + layers: Optional[int] = 4, + num_inference_steps: int = 50, + sigmas: Optional[List[float]] = None, + guidance_scale: Optional[float] = None, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + prompt_embeds_mask: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds_mask: Optional[torch.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 512, + resolution: int = 640, + cfg_normalize: bool = False, + use_en_prompt: bool = False, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
+ not greater than `1`).
+ true_cfg_scale (`float`, *optional*, defaults to 4.0):
+ Guidance scale as defined in [Classifier-Free Diffusion
+ Guidance](https://huggingface.co/papers/2207.12598). `true_cfg_scale` is defined as `w` of
+ equation 2. of [Imagen Paper](https://huggingface.co/papers/2205.11487). Classifier-free guidance is
+ enabled by setting `true_cfg_scale > 1` and a provided `negative_prompt`. A higher guidance scale
+ encourages the model to generate images that are closely linked to the text `prompt`, usually at the
+ expense of lower image quality.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference.
+ sigmas (`List[float]`, *optional*):
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+ will be used.
+ guidance_scale (`float`, *optional*, defaults to None):
+ A guidance scale value for guidance distilled models. Unlike the traditional classifier-free guidance
+ where the guidance scale is applied during inference through noise prediction rescaling, guidance
+ distilled models take the guidance scale directly as an input parameter during forward pass. Guidance
+ scale is enabled by setting `guidance_scale > 1`. A higher guidance scale encourages the model to
+ generate images that are closely linked to the text `prompt`, usually at the expense of lower image
+ quality. This parameter in the pipeline is there to support future guidance-distilled models when they
+ come up. It is ignored when not using guidance distilled models. To enable traditional classifier-free
+ guidance, please pass `true_cfg_scale > 1.0` and `negative_prompt` (even an empty negative prompt like
+ " " should enable classifier-free guidance computations).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+ to make generation deterministic.
+ latents (`torch.Tensor`, *optional*):
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor will be generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+ provided, text embeddings will be generated from `prompt` input argument.
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+ argument.
+ output_type (`str`, *optional*, defaults to `"pil"`):
+ The output format of the generated image. Choose between
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+ return_dict (`bool`, *optional*, defaults to `True`):
+ Whether or not to return a [`~pipelines.qwenimage.QwenImagePipelineOutput`] instead of a plain tuple.
+ attention_kwargs (`dict`, *optional*):
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+ `self.processor` in
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+ callback_on_step_end (`Callable`, *optional*):
+ A function that is called at the end of each denoising step during inference. The function is called
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+ `callback_on_step_end_tensor_inputs`.
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+ `._callback_tensor_inputs` attribute of your pipeline class.
+ max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
+ resolution (`int`, *optional*, defaults to 640):
+ Resolution bucket to use, either 640 or 1024; it determines the condition and output resolution.
+ cfg_normalize (`bool`, *optional*, defaults to `False`):
+ Whether to rescale the classifier-free-guided noise prediction to the norm of the conditional
+ prediction.
+ use_en_prompt (`bool`, *optional*, defaults to `False`):
+ Whether to caption the input image in English (instead of Chinese) when no `prompt` is provided.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.qwenimage.QwenImagePipelineOutput`] or `tuple`:
+ [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
+ returning a tuple, the first element is a list with the generated images.
+ """
+ image_size = image[0].size if isinstance(image, list) else image.size
+ assert resolution in [640, 1024], f"resolution must be either 640 or 1024, but got {resolution}"
+ calculated_width, calculated_height = calculate_dimensions(
+ resolution * resolution, image_size[0] / image_size[1]
+ )
+ height = calculated_height
+ width = calculated_width
+
+ multiple_of = self.vae_scale_factor * 2
+ width = width // multiple_of * multiple_of
+ height = height // multiple_of * multiple_of
+
+ # 1. Check inputs. Raise error if not correct
+ self.check_inputs(
+ height,
+ width,
+ negative_prompt=negative_prompt,
+ prompt_embeds=prompt_embeds,
+ negative_prompt_embeds=negative_prompt_embeds,
+ prompt_embeds_mask=prompt_embeds_mask,
+ negative_prompt_embeds_mask=negative_prompt_embeds_mask,
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+ max_sequence_length=max_sequence_length,
+ )
+
+ self._guidance_scale = guidance_scale
+ self._attention_kwargs = attention_kwargs
+ self._current_timestep = None
+ self._interrupt = False
+
+ device = self._execution_device
+ # 2. 
Preprocess image + if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels): + image = self.image_processor.resize(image, calculated_height, calculated_width) + prompt_image = image + image = self.image_processor.preprocess(image, calculated_height, calculated_width) + image = image.unsqueeze(2) + image = image.to(dtype=self.text_encoder.dtype) + + if prompt is None or prompt == "" or prompt == " ": + prompt = self.get_image_caption(prompt_image, use_en_prompt=use_en_prompt, device=device) + + # 3. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + has_neg_prompt = negative_prompt is not None or ( + negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None + ) + + if true_cfg_scale > 1 and not has_neg_prompt: + logger.warning( + f"true_cfg_scale is passed as {true_cfg_scale}, but classifier-free guidance is not enabled since no negative_prompt is provided." + ) + elif true_cfg_scale <= 1 and has_neg_prompt: + logger.warning( + " negative_prompt is passed but classifier-free guidance is not enabled since true_cfg_scale <= 1" + ) + + do_true_cfg = true_cfg_scale > 1 and has_neg_prompt + prompt_embeds, prompt_embeds_mask = self.encode_prompt( + prompt=prompt, + prompt_embeds=prompt_embeds, + prompt_embeds_mask=prompt_embeds_mask, + device=device, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + ) + if do_true_cfg: + negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt( + prompt=negative_prompt, + prompt_embeds=negative_prompt_embeds, + prompt_embeds_mask=negative_prompt_embeds_mask, + device=device, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + ) + + # 4. Prepare latent variables + num_channels_latents = self.transformer.config.in_channels // 4 + latents, image_latents = self.prepare_latents( + image, + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + layers, + prompt_embeds.dtype, + device, + generator, + latents, + ) + img_shapes = [ + [ + *[ + (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2) + for _ in range(layers + 1) + ], + (1, calculated_height // self.vae_scale_factor // 2, calculated_width // self.vae_scale_factor // 2), + ] + ] * batch_size + + # 5. 
Prepare timesteps + sigmas = np.linspace(1.0, 0, num_inference_steps + 1)[:-1] if sigmas is None else sigmas + image_seq_len = latents.shape[1] + base_seqlen = 256 * 256 / 16 / 16 + mu = (image_latents.shape[1] / base_seqlen) ** 0.5 + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + sigmas=sigmas, + mu=mu, + ) + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + self._num_timesteps = len(timesteps) + + # handle guidance + if self.transformer.config.guidance_embeds and guidance_scale is None: + raise ValueError("guidance_scale is required for guidance-distilled model.") + elif self.transformer.config.guidance_embeds: + guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32) + guidance = guidance.expand(latents.shape[0]) + elif not self.transformer.config.guidance_embeds and guidance_scale is not None: + logger.warning( + f"guidance_scale is passed as {guidance_scale}, but ignored since the model is not guidance-distilled." + ) + guidance = None + elif not self.transformer.config.guidance_embeds and guidance_scale is None: + guidance = None + + if self.attention_kwargs is None: + self._attention_kwargs = {} + + txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist() if prompt_embeds_mask is not None else None + negative_txt_seq_lens = ( + negative_prompt_embeds_mask.sum(dim=1).tolist() if negative_prompt_embeds_mask is not None else None + ) + is_rgb = torch.tensor([0] * batch_size).to(device=device, dtype=torch.long) + # 6. Denoising loop + self.scheduler.set_begin_index(0) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t + + latent_model_input = latents + if image_latents is not None: + latent_model_input = torch.cat([latents, image_latents], dim=1) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latents.shape[0]).to(latents.dtype) + with self.transformer.cache_context("cond"): + noise_pred = self.transformer( + hidden_states=latent_model_input, + timestep=timestep / 1000, + guidance=guidance, + encoder_hidden_states_mask=prompt_embeds_mask, + encoder_hidden_states=prompt_embeds, + img_shapes=img_shapes, + txt_seq_lens=txt_seq_lens, + attention_kwargs=self.attention_kwargs, + additional_t_cond=is_rgb, + return_dict=False, + )[0] + noise_pred = noise_pred[:, : latents.size(1)] + + if do_true_cfg: + with self.transformer.cache_context("uncond"): + neg_noise_pred = self.transformer( + hidden_states=latent_model_input, + timestep=timestep / 1000, + guidance=guidance, + encoder_hidden_states_mask=negative_prompt_embeds_mask, + encoder_hidden_states=negative_prompt_embeds, + img_shapes=img_shapes, + txt_seq_lens=negative_txt_seq_lens, + attention_kwargs=self.attention_kwargs, + additional_t_cond=is_rgb, + return_dict=False, + )[0] + neg_noise_pred = neg_noise_pred[:, : latents.size(1)] + comb_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred) + + if cfg_normalize: + cond_norm = torch.norm(noise_pred, dim=-1, keepdim=True) + noise_norm = torch.norm(comb_pred, dim=-1, keepdim=True) + noise_pred = comb_pred * (cond_norm / noise_norm) + else: + noise_pred = comb_pred + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + if latents.dtype != latents_dtype: + if 
torch.backends.mps.is_available(): + # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272 + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if XLA_AVAILABLE: + xm.mark_step() + + self._current_timestep = None + if output_type == "latent": + image = latents + else: + latents = self._unpack_latents(latents, height, width, layers, self.vae_scale_factor) + latents = latents.to(self.vae.dtype) + latents_mean = ( + torch.tensor(self.vae.config.latents_mean) + .view(1, self.vae.config.z_dim, 1, 1, 1) + .to(latents.device, latents.dtype) + ) + latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to( + latents.device, latents.dtype + ) + latents = latents / latents_std + latents_mean + + b, c, f, h, w = latents.shape + + latents = latents[:, :, 1:] # remove the first frame as it is the orgin input + + latents = latents.permute(0, 2, 1, 3, 4).view(-1, c, 1, h, w) + + image = self.vae.decode(latents, return_dict=False)[0] # (b f) c 1 h w + + image = image.squeeze(2) + + image = self.image_processor.postprocess(image, output_type=output_type) + images = [] + for bidx in range(b): + images.append(image[bidx * f : (bidx + 1) * f]) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (images,) + + return QwenImagePipelineOutput(images=images) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index be7f8f8ce4..cd51d3a567 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -2297,6 +2297,21 @@ class QwenImageInpaintPipeline(metaclass=DummyObject): requires_backends(cls, ["torch", "transformers"]) +class QwenImageLayeredPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class QwenImagePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From 55463f7ace24f0506a38c3971291e63d50bc989d Mon Sep 17 00:00:00 2001 From: hlky Date: Wed, 17 Dec 2025 19:44:20 +0000 Subject: [PATCH 08/11] Z-Image-Turbo ControlNet (#12792) * init --------- Co-authored-by: github-actions[bot] --- src/diffusers/__init__.py | 6 + src/diffusers/loaders/single_file_model.py | 10 +- src/diffusers/loaders/single_file_utils.py | 24 + src/diffusers/models/__init__.py | 2 + src/diffusers/models/controlnets/__init__.py | 1 + .../models/controlnets/controlnet_z_image.py | 824 ++++++++++++++++++ .../transformers/transformer_z_image.py | 13 +- src/diffusers/pipelines/__init__.py | 14 +- src/diffusers/pipelines/z_image/__init__.py | 4 + .../z_image/pipeline_z_image_controlnet.py | 725 
+++++++++++++++ .../pipeline_z_image_controlnet_inpaint.py | 747 ++++++++++++++++ src/diffusers/utils/dummy_pt_objects.py | 15 + .../dummy_torch_and_transformers_objects.py | 30 + 13 files changed, 2409 insertions(+), 6 deletions(-) create mode 100644 src/diffusers/models/controlnets/controlnet_z_image.py create mode 100644 src/diffusers/pipelines/z_image/pipeline_z_image_controlnet.py create mode 100644 src/diffusers/pipelines/z_image/pipeline_z_image_controlnet_inpaint.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index aec7efd1ff..03ecaf6bc1 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -279,6 +279,7 @@ else: "WanAnimateTransformer3DModel", "WanTransformer3DModel", "WanVACETransformer3DModel", + "ZImageControlNetModel", "ZImageTransformer2DModel", "attention_backend", ] @@ -670,6 +671,8 @@ else: "WuerstchenCombinedPipeline", "WuerstchenDecoderPipeline", "WuerstchenPriorPipeline", + "ZImageControlNetInpaintPipeline", + "ZImageControlNetPipeline", "ZImageImg2ImgPipeline", "ZImagePipeline", ] @@ -1017,6 +1020,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: WanAnimateTransformer3DModel, WanTransformer3DModel, WanVACETransformer3DModel, + ZImageControlNetModel, ZImageTransformer2DModel, attention_backend, ) @@ -1377,6 +1381,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: WuerstchenCombinedPipeline, WuerstchenDecoderPipeline, WuerstchenPriorPipeline, + ZImageControlNetInpaintPipeline, + ZImageControlNetPipeline, ZImageImg2ImgPipeline, ZImagePipeline, ) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index 803fdfc2d9..0e4ebab7fe 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -49,6 +49,7 @@ from .single_file_utils import ( convert_stable_cascade_unet_single_file_to_diffusers, convert_wan_transformer_to_diffusers, convert_wan_vae_to_diffusers, + convert_z_image_controlnet_checkpoint_to_diffusers, convert_z_image_transformer_checkpoint_to_diffusers, create_controlnet_diffusers_config_from_ldm, create_unet_diffusers_config_from_ldm, @@ -172,11 +173,18 @@ SINGLE_FILE_LOADABLE_CLASSES = { "checkpoint_mapping_fn": convert_z_image_transformer_checkpoint_to_diffusers, "default_subfolder": "transformer", }, + "ZImageControlNetModel": { + "checkpoint_mapping_fn": convert_z_image_controlnet_checkpoint_to_diffusers, + }, } def _should_convert_state_dict_to_diffusers(model_state_dict, checkpoint_state_dict): - return not set(model_state_dict.keys()).issubset(set(checkpoint_state_dict.keys())) + model_state_dict_keys = set(model_state_dict.keys()) + checkpoint_state_dict_keys = set(checkpoint_state_dict.keys()) + is_subset = model_state_dict_keys.issubset(checkpoint_state_dict_keys) + is_match = model_state_dict_keys == checkpoint_state_dict_keys + return not (is_subset and is_match) def _get_single_file_loadable_mapping_class(cls): diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index b866a5a21a..aac4835fe8 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -121,6 +121,8 @@ CHECKPOINT_KEY_NAMES = { "instruct-pix2pix": "model.diffusion_model.input_blocks.0.0.weight", "lumina2": ["model.diffusion_model.cap_embedder.0.weight", "cap_embedder.0.weight"], "z-image-turbo": "cap_embedder.0.weight", + "z-image-turbo-controlnet": "control_all_x_embedder.2-1.weight", + "z-image-turbo-controlnet-2.x": "control_layers.14.adaLN_modulation.0.weight", 
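+    # The "-2.x" key is checked before the base "-controlnet" key in infer_diffusers_model_type below,
+    # so checkpoints that contain control_layers are identified as the 2.x variant.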
"sana": [ "blocks.0.cross_attn.q_linear.weight", "blocks.0.cross_attn.q_linear.bias", @@ -220,6 +222,8 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = { "cosmos-2.0-v2w-2B": {"pretrained_model_name_or_path": "nvidia/Cosmos-Predict2-2B-Video2World"}, "cosmos-2.0-v2w-14B": {"pretrained_model_name_or_path": "nvidia/Cosmos-Predict2-14B-Video2World"}, "z-image-turbo": {"pretrained_model_name_or_path": "Tongyi-MAI/Z-Image-Turbo"}, + "z-image-turbo-controlnet": {"pretrained_model_name_or_path": "hlky/Z-Image-Turbo-Fun-Controlnet-Union"}, + "z-image-turbo-controlnet-2.x": {"pretrained_model_name_or_path": "hlky/Z-Image-Turbo-Fun-Controlnet-Union-2.1"}, } # Use to configure model sample size when original config is provided @@ -779,6 +783,12 @@ def infer_diffusers_model_type(checkpoint): else: raise ValueError(f"Unexpected x_embedder shape: {x_embedder_shape} when loading Cosmos 2.0 model.") + elif CHECKPOINT_KEY_NAMES["z-image-turbo-controlnet-2.x"] in checkpoint: + model_type = "z-image-turbo-controlnet-2.x" + + elif CHECKPOINT_KEY_NAMES["z-image-turbo-controlnet"] in checkpoint: + model_type = "z-image-turbo-controlnet" + else: model_type = "v1" @@ -3885,3 +3895,17 @@ def convert_z_image_transformer_checkpoint_to_diffusers(checkpoint, **kwargs): handler_fn_inplace(key, converted_state_dict) return converted_state_dict + + +def convert_z_image_controlnet_checkpoint_to_diffusers(checkpoint, config, **kwargs): + if config["add_control_noise_refiner"] is None: + return checkpoint + elif config["add_control_noise_refiner"] == "control_noise_refiner": + return checkpoint + elif config["add_control_noise_refiner"] == "control_layers": + converted_state_dict = { + key: checkpoint.pop(key) for key in list(checkpoint.keys()) if not key.startswith("control_noise_refiner.") + } + return converted_state_dict + else: + raise ValueError("Unknown Z-Image Turbo ControlNet type.") diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 4c1b397bdf..c4664f00ca 100755 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -66,6 +66,7 @@ if is_torch_available(): _import_structure["controlnets.controlnet_sparsectrl"] = ["SparseControlNetModel"] _import_structure["controlnets.controlnet_union"] = ["ControlNetUnionModel"] _import_structure["controlnets.controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"] + _import_structure["controlnets.controlnet_z_image"] = ["ZImageControlNetModel"] _import_structure["controlnets.multicontrolnet"] = ["MultiControlNetModel"] _import_structure["controlnets.multicontrolnet_union"] = ["MultiControlNetUnionModel"] _import_structure["embeddings"] = ["ImageProjection"] @@ -181,6 +182,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: SD3MultiControlNetModel, SparseControlNetModel, UNetControlNetXSModel, + ZImageControlNetModel, ) from .embeddings import ImageProjection from .modeling_utils import ModelMixin diff --git a/src/diffusers/models/controlnets/__init__.py b/src/diffusers/models/controlnets/__init__.py index 7ce352879d..fee7f231e8 100644 --- a/src/diffusers/models/controlnets/__init__.py +++ b/src/diffusers/models/controlnets/__init__.py @@ -19,6 +19,7 @@ if is_torch_available(): ) from .controlnet_union import ControlNetUnionModel from .controlnet_xs import ControlNetXSAdapter, ControlNetXSOutput, UNetControlNetXSModel + from .controlnet_z_image import ZImageControlNetModel from .multicontrolnet import MultiControlNetModel from .multicontrolnet_union import MultiControlNetUnionModel diff --git 
a/src/diffusers/models/controlnets/controlnet_z_image.py b/src/diffusers/models/controlnets/controlnet_z_image.py new file mode 100644 index 0000000000..54e398ea13 --- /dev/null +++ b/src/diffusers/models/controlnets/controlnet_z_image.py @@ -0,0 +1,824 @@ +# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import List, Literal, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pad_sequence + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import PeftAdapterMixin +from ...loaders.single_file_model import FromOriginalModelMixin +from ...models.attention_processor import Attention +from ...models.normalization import RMSNorm +from ...utils.torch_utils import maybe_allow_in_graph +from ..attention_dispatch import dispatch_attention_fn +from ..controlnets.controlnet import zero_module +from ..modeling_utils import ModelMixin + + +ADALN_EMBED_DIM = 256 +SEQ_MULTI_OF = 32 + + +# Copied from diffusers.models.transformers.transformer_z_image.TimestepEmbedder +class TimestepEmbedder(nn.Module): + def __init__(self, out_size, mid_size=None, frequency_embedding_size=256): + super().__init__() + if mid_size is None: + mid_size = out_size + self.mlp = nn.Sequential( + nn.Linear(frequency_embedding_size, mid_size, bias=True), + nn.SiLU(), + nn.Linear(mid_size, out_size, bias=True), + ) + + self.frequency_embedding_size = frequency_embedding_size + + @staticmethod + def timestep_embedding(t, dim, max_period=10000): + with torch.amp.autocast("cuda", enabled=False): + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half + ) + args = t[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + return embedding + + def forward(self, t): + t_freq = self.timestep_embedding(t, self.frequency_embedding_size) + weight_dtype = self.mlp[0].weight.dtype + compute_dtype = getattr(self.mlp[0], "compute_dtype", None) + if weight_dtype.is_floating_point: + t_freq = t_freq.to(weight_dtype) + elif compute_dtype is not None: + t_freq = t_freq.to(compute_dtype) + t_emb = self.mlp(t_freq) + return t_emb + + +# Copied from diffusers.models.transformers.transformer_z_image.ZSingleStreamAttnProcessor +class ZSingleStreamAttnProcessor: + """ + Processor for Z-Image single stream attention that adapts the existing Attention class to match the behavior of the + original Z-ImageAttention module. + """ + + _attention_backend = None + _parallel_config = None + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError( + "ZSingleStreamAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to version 2.0 or higher." 
+ ) + + def __call__( + self, + attn: Attention, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + freqs_cis: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + query = attn.to_q(hidden_states) + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + + query = query.unflatten(-1, (attn.heads, -1)) + key = key.unflatten(-1, (attn.heads, -1)) + value = value.unflatten(-1, (attn.heads, -1)) + + # Apply Norms + if attn.norm_q is not None: + query = attn.norm_q(query) + if attn.norm_k is not None: + key = attn.norm_k(key) + + # Apply RoPE + def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor: + with torch.amp.autocast("cuda", enabled=False): + x = torch.view_as_complex(x_in.float().reshape(*x_in.shape[:-1], -1, 2)) + freqs_cis = freqs_cis.unsqueeze(2) + x_out = torch.view_as_real(x * freqs_cis).flatten(3) + return x_out.type_as(x_in) # todo + + if freqs_cis is not None: + query = apply_rotary_emb(query, freqs_cis) + key = apply_rotary_emb(key, freqs_cis) + + # Cast to correct dtype + dtype = query.dtype + query, key = query.to(dtype), key.to(dtype) + + # From [batch, seq_len] to [batch, 1, 1, seq_len] -> broadcast to [batch, heads, seq_len, seq_len] + if attention_mask is not None and attention_mask.ndim == 2: + attention_mask = attention_mask[:, None, None, :] + + # Compute joint attention + hidden_states = dispatch_attention_fn( + query, + key, + value, + attn_mask=attention_mask, + dropout_p=0.0, + is_causal=False, + backend=self._attention_backend, + parallel_config=self._parallel_config, + ) + + # Reshape back + hidden_states = hidden_states.flatten(2, 3) + hidden_states = hidden_states.to(dtype) + + output = attn.to_out[0](hidden_states) + if len(attn.to_out) > 1: # dropout + output = attn.to_out[1](output) + + return output + + +# Copied from diffusers.models.transformers.transformer_z_image.FeedForward +class FeedForward(nn.Module): + def __init__(self, dim: int, hidden_dim: int): + super().__init__() + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def _forward_silu_gating(self, x1, x3): + return F.silu(x1) * x3 + + def forward(self, x): + return self.w2(self._forward_silu_gating(self.w1(x), self.w3(x))) + + +@maybe_allow_in_graph +# Copied from diffusers.models.transformers.transformer_z_image.ZImageTransformerBlock +class ZImageTransformerBlock(nn.Module): + def __init__( + self, + layer_id: int, + dim: int, + n_heads: int, + n_kv_heads: int, + norm_eps: float, + qk_norm: bool, + modulation=True, + ): + super().__init__() + self.dim = dim + self.head_dim = dim // n_heads + + # Refactored to use diffusers Attention with custom processor + # Original Z-Image params: dim, n_heads, n_kv_heads, qk_norm + self.attention = Attention( + query_dim=dim, + cross_attention_dim=None, + dim_head=dim // n_heads, + heads=n_heads, + qk_norm="rms_norm" if qk_norm else None, + eps=1e-5, + bias=False, + out_bias=False, + processor=ZSingleStreamAttnProcessor(), + ) + + self.feed_forward = FeedForward(dim=dim, hidden_dim=int(dim / 3 * 8)) + self.layer_id = layer_id + + self.attention_norm1 = RMSNorm(dim, eps=norm_eps) + self.ffn_norm1 = RMSNorm(dim, eps=norm_eps) + + self.attention_norm2 = RMSNorm(dim, eps=norm_eps) + self.ffn_norm2 = RMSNorm(dim, eps=norm_eps) + + self.modulation = modulation + if modulation: + self.adaLN_modulation = nn.Sequential(nn.Linear(min(dim, 
ADALN_EMBED_DIM), 4 * dim, bias=True)) + + def forward( + self, + x: torch.Tensor, + attn_mask: torch.Tensor, + freqs_cis: torch.Tensor, + adaln_input: Optional[torch.Tensor] = None, + ): + if self.modulation: + assert adaln_input is not None + scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).unsqueeze(1).chunk(4, dim=2) + gate_msa, gate_mlp = gate_msa.tanh(), gate_mlp.tanh() + scale_msa, scale_mlp = 1.0 + scale_msa, 1.0 + scale_mlp + + # Attention block + attn_out = self.attention( + self.attention_norm1(x) * scale_msa, attention_mask=attn_mask, freqs_cis=freqs_cis + ) + x = x + gate_msa * self.attention_norm2(attn_out) + + # FFN block + x = x + gate_mlp * self.ffn_norm2(self.feed_forward(self.ffn_norm1(x) * scale_mlp)) + else: + # Attention block + attn_out = self.attention(self.attention_norm1(x), attention_mask=attn_mask, freqs_cis=freqs_cis) + x = x + self.attention_norm2(attn_out) + + # FFN block + x = x + self.ffn_norm2(self.feed_forward(self.ffn_norm1(x))) + + return x + + +# Copied from diffusers.models.transformers.transformer_z_image.RopeEmbedder +class RopeEmbedder: + def __init__( + self, + theta: float = 256.0, + axes_dims: List[int] = (16, 56, 56), + axes_lens: List[int] = (64, 128, 128), + ): + self.theta = theta + self.axes_dims = axes_dims + self.axes_lens = axes_lens + assert len(axes_dims) == len(axes_lens), "axes_dims and axes_lens must have the same length" + self.freqs_cis = None + + @staticmethod + def precompute_freqs_cis(dim: List[int], end: List[int], theta: float = 256.0): + with torch.device("cpu"): + freqs_cis = [] + for i, (d, e) in enumerate(zip(dim, end)): + freqs = 1.0 / (theta ** (torch.arange(0, d, 2, dtype=torch.float64, device="cpu") / d)) + timestep = torch.arange(e, device=freqs.device, dtype=torch.float64) + freqs = torch.outer(timestep, freqs).float() + freqs_cis_i = torch.polar(torch.ones_like(freqs), freqs).to(torch.complex64) # complex64 + freqs_cis.append(freqs_cis_i) + + return freqs_cis + + def __call__(self, ids: torch.Tensor): + assert ids.ndim == 2 + assert ids.shape[-1] == len(self.axes_dims) + device = ids.device + + if self.freqs_cis is None: + self.freqs_cis = self.precompute_freqs_cis(self.axes_dims, self.axes_lens, theta=self.theta) + self.freqs_cis = [freqs_cis.to(device) for freqs_cis in self.freqs_cis] + else: + # Ensure freqs_cis are on the same device as ids + if self.freqs_cis[0].device != device: + self.freqs_cis = [freqs_cis.to(device) for freqs_cis in self.freqs_cis] + + result = [] + for i in range(len(self.axes_dims)): + index = ids[:, i] + result.append(self.freqs_cis[i][index]) + return torch.cat(result, dim=-1) + + +@maybe_allow_in_graph +class ZImageControlTransformerBlock(nn.Module): + def __init__( + self, + layer_id: int, + dim: int, + n_heads: int, + n_kv_heads: int, + norm_eps: float, + qk_norm: bool, + modulation=True, + block_id=0, + ): + super().__init__() + self.dim = dim + self.head_dim = dim // n_heads + + # Refactored to use diffusers Attention with custom processor + # Original Z-Image params: dim, n_heads, n_kv_heads, qk_norm + self.attention = Attention( + query_dim=dim, + cross_attention_dim=None, + dim_head=dim // n_heads, + heads=n_heads, + qk_norm="rms_norm" if qk_norm else None, + eps=1e-5, + bias=False, + out_bias=False, + processor=ZSingleStreamAttnProcessor(), + ) + + self.feed_forward = FeedForward(dim=dim, hidden_dim=int(dim / 3 * 8)) + self.layer_id = layer_id + + self.attention_norm1 = RMSNorm(dim, eps=norm_eps) + self.ffn_norm1 = RMSNorm(dim, eps=norm_eps) + + 
self.attention_norm2 = RMSNorm(dim, eps=norm_eps) + self.ffn_norm2 = RMSNorm(dim, eps=norm_eps) + + self.modulation = modulation + if modulation: + self.adaLN_modulation = nn.Sequential(nn.Linear(min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True)) + + # Control variant start + self.block_id = block_id + if block_id == 0: + self.before_proj = zero_module(nn.Linear(self.dim, self.dim)) + self.after_proj = zero_module(nn.Linear(self.dim, self.dim)) + + def forward( + self, + c: torch.Tensor, + x: torch.Tensor, + attn_mask: torch.Tensor, + freqs_cis: torch.Tensor, + adaln_input: Optional[torch.Tensor] = None, + ): + # Control + if self.block_id == 0: + c = self.before_proj(c) + x + all_c = [] + else: + all_c = list(torch.unbind(c)) + c = all_c.pop(-1) + + # Compared to `ZImageTransformerBlock` x -> c + if self.modulation: + assert adaln_input is not None + scale_msa, gate_msa, scale_mlp, gate_mlp = self.adaLN_modulation(adaln_input).unsqueeze(1).chunk(4, dim=2) + gate_msa, gate_mlp = gate_msa.tanh(), gate_mlp.tanh() + scale_msa, scale_mlp = 1.0 + scale_msa, 1.0 + scale_mlp + + # Attention block + attn_out = self.attention( + self.attention_norm1(c) * scale_msa, attention_mask=attn_mask, freqs_cis=freqs_cis + ) + c = c + gate_msa * self.attention_norm2(attn_out) + + # FFN block + c = c + gate_mlp * self.ffn_norm2(self.feed_forward(self.ffn_norm1(c) * scale_mlp)) + else: + # Attention block + attn_out = self.attention(self.attention_norm1(c), attention_mask=attn_mask, freqs_cis=freqs_cis) + c = c + self.attention_norm2(attn_out) + + # FFN block + c = c + self.ffn_norm2(self.feed_forward(self.ffn_norm1(c))) + + # Control + c_skip = self.after_proj(c) + all_c += [c_skip, c] + c = torch.stack(all_c) + return c + + +class ZImageControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + control_layers_places: List[int] = None, + control_refiner_layers_places: List[int] = None, + control_in_dim=None, + add_control_noise_refiner: Optional[Literal["control_layers", "control_noise_refiner"]] = None, + all_patch_size=(2,), + all_f_patch_size=(1,), + dim=3840, + n_refiner_layers=2, + n_heads=30, + n_kv_heads=30, + norm_eps=1e-5, + qk_norm=True, + ): + super().__init__() + self.control_layers_places = control_layers_places + self.control_in_dim = control_in_dim + self.control_refiner_layers_places = control_refiner_layers_places + self.add_control_noise_refiner = add_control_noise_refiner + + assert 0 in self.control_layers_places + + # control blocks + self.control_layers = nn.ModuleList( + [ + ZImageControlTransformerBlock(i, dim, n_heads, n_kv_heads, norm_eps, qk_norm, block_id=i) + for i in self.control_layers_places + ] + ) + + # control patch embeddings + all_x_embedder = {} + for patch_idx, (patch_size, f_patch_size) in enumerate(zip(all_patch_size, all_f_patch_size)): + x_embedder = nn.Linear(f_patch_size * patch_size * patch_size * self.control_in_dim, dim, bias=True) + all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder + + self.control_all_x_embedder = nn.ModuleDict(all_x_embedder) + if self.add_control_noise_refiner == "control_layers": + self.control_noise_refiner = None + elif self.add_control_noise_refiner == "control_noise_refiner": + self.control_noise_refiner = nn.ModuleList( + [ + ZImageControlTransformerBlock( + 1000 + layer_id, + dim, + n_heads, + n_kv_heads, + norm_eps, + qk_norm, + modulation=True, + block_id=layer_id, + ) + for layer_id in range(n_refiner_layers) + 
] + ) + else: + self.control_noise_refiner = nn.ModuleList( + [ + ZImageTransformerBlock( + 1000 + layer_id, + dim, + n_heads, + n_kv_heads, + norm_eps, + qk_norm, + modulation=True, + ) + for layer_id in range(n_refiner_layers) + ] + ) + + self.t_scale: Optional[float] = None + self.t_embedder: Optional[TimestepEmbedder] = None + self.all_x_embedder: Optional[nn.ModuleDict] = None + self.cap_embedder: Optional[nn.Sequential] = None + self.rope_embedder: Optional[RopeEmbedder] = None + self.noise_refiner: Optional[nn.ModuleList] = None + self.context_refiner: Optional[nn.ModuleList] = None + self.x_pad_token: Optional[nn.Parameter] = None + self.cap_pad_token: Optional[nn.Parameter] = None + + @classmethod + def from_transformer(cls, controlnet, transformer): + controlnet.t_scale = transformer.t_scale + controlnet.t_embedder = transformer.t_embedder + controlnet.all_x_embedder = transformer.all_x_embedder + controlnet.cap_embedder = transformer.cap_embedder + controlnet.rope_embedder = transformer.rope_embedder + controlnet.noise_refiner = transformer.noise_refiner + controlnet.context_refiner = transformer.context_refiner + controlnet.x_pad_token = transformer.x_pad_token + controlnet.cap_pad_token = transformer.cap_pad_token + return controlnet + + @staticmethod + # Copied from diffusers.models.transformers.transformer_z_image.ZImageTransformer2DModel.create_coordinate_grid + def create_coordinate_grid(size, start=None, device=None): + if start is None: + start = (0 for _ in size) + + axes = [torch.arange(x0, x0 + span, dtype=torch.int32, device=device) for x0, span in zip(start, size)] + grids = torch.meshgrid(axes, indexing="ij") + return torch.stack(grids, dim=-1) + + # Copied from diffusers.models.transformers.transformer_z_image.ZImageTransformer2DModel.patchify_and_embed + def patchify_and_embed( + self, + all_image: List[torch.Tensor], + all_cap_feats: List[torch.Tensor], + patch_size: int, + f_patch_size: int, + ): + pH = pW = patch_size + pF = f_patch_size + device = all_image[0].device + + all_image_out = [] + all_image_size = [] + all_image_pos_ids = [] + all_image_pad_mask = [] + all_cap_pos_ids = [] + all_cap_pad_mask = [] + all_cap_feats_out = [] + + for i, (image, cap_feat) in enumerate(zip(all_image, all_cap_feats)): + ### Process Caption + cap_ori_len = len(cap_feat) + cap_padding_len = (-cap_ori_len) % SEQ_MULTI_OF + # padded position ids + cap_padded_pos_ids = self.create_coordinate_grid( + size=(cap_ori_len + cap_padding_len, 1, 1), + start=(1, 0, 0), + device=device, + ).flatten(0, 2) + all_cap_pos_ids.append(cap_padded_pos_ids) + # pad mask + cap_pad_mask = torch.cat( + [ + torch.zeros((cap_ori_len,), dtype=torch.bool, device=device), + torch.ones((cap_padding_len,), dtype=torch.bool, device=device), + ], + dim=0, + ) + all_cap_pad_mask.append( + cap_pad_mask if cap_padding_len > 0 else torch.zeros((cap_ori_len,), dtype=torch.bool, device=device) + ) + + # padded feature + cap_padded_feat = torch.cat([cap_feat, cap_feat[-1:].repeat(cap_padding_len, 1)], dim=0) + all_cap_feats_out.append(cap_padded_feat) + + ### Process Image + C, F, H, W = image.size() + all_image_size.append((F, H, W)) + F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW + + image = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW) + # "c f pf h ph w pw -> (f h w) (pf ph pw c)" + image = image.permute(1, 3, 5, 2, 4, 6, 0).reshape(F_tokens * H_tokens * W_tokens, pF * pH * pW * C) + + image_ori_len = len(image) + image_padding_len = (-image_ori_len) % SEQ_MULTI_OF + + image_ori_pos_ids = 
self.create_coordinate_grid( + size=(F_tokens, H_tokens, W_tokens), + start=(cap_ori_len + cap_padding_len + 1, 0, 0), + device=device, + ).flatten(0, 2) + image_padded_pos_ids = torch.cat( + [ + image_ori_pos_ids, + self.create_coordinate_grid(size=(1, 1, 1), start=(0, 0, 0), device=device) + .flatten(0, 2) + .repeat(image_padding_len, 1), + ], + dim=0, + ) + all_image_pos_ids.append(image_padded_pos_ids if image_padding_len > 0 else image_ori_pos_ids) + # pad mask + image_pad_mask = torch.cat( + [ + torch.zeros((image_ori_len,), dtype=torch.bool, device=device), + torch.ones((image_padding_len,), dtype=torch.bool, device=device), + ], + dim=0, + ) + all_image_pad_mask.append( + image_pad_mask + if image_padding_len > 0 + else torch.zeros((image_ori_len,), dtype=torch.bool, device=device) + ) + # padded feature + image_padded_feat = torch.cat( + [image, image[-1:].repeat(image_padding_len, 1)], + dim=0, + ) + all_image_out.append(image_padded_feat if image_padding_len > 0 else image) + + return ( + all_image_out, + all_cap_feats_out, + all_image_size, + all_image_pos_ids, + all_cap_pos_ids, + all_image_pad_mask, + all_cap_pad_mask, + ) + + def patchify( + self, + all_image: List[torch.Tensor], + patch_size: int, + f_patch_size: int, + ): + pH = pW = patch_size + pF = f_patch_size + all_image_out = [] + + for i, image in enumerate(all_image): + ### Process Image + C, F, H, W = image.size() + F_tokens, H_tokens, W_tokens = F // pF, H // pH, W // pW + + image = image.view(C, F_tokens, pF, H_tokens, pH, W_tokens, pW) + # "c f pf h ph w pw -> (f h w) (pf ph pw c)" + image = image.permute(1, 3, 5, 2, 4, 6, 0).reshape(F_tokens * H_tokens * W_tokens, pF * pH * pW * C) + + image_ori_len = len(image) + image_padding_len = (-image_ori_len) % SEQ_MULTI_OF + + # padded feature + image_padded_feat = torch.cat([image, image[-1:].repeat(image_padding_len, 1)], dim=0) + all_image_out.append(image_padded_feat) + + return all_image_out + + def forward( + self, + x: List[torch.Tensor], + t, + cap_feats: List[torch.Tensor], + control_context: List[torch.Tensor], + conditioning_scale: float = 1.0, + patch_size=2, + f_patch_size=1, + ): + if ( + self.t_scale is None + or self.t_embedder is None + or self.all_x_embedder is None + or self.cap_embedder is None + or self.rope_embedder is None + or self.noise_refiner is None + or self.context_refiner is None + or self.x_pad_token is None + or self.cap_pad_token is None + ): + raise ValueError( + "Required modules are `None`, use `from_transformer` to share required modules from `transformer`." 
+ ) + + assert patch_size in self.config.all_patch_size + assert f_patch_size in self.config.all_f_patch_size + + bsz = len(x) + device = x[0].device + t = t * self.t_scale + t = self.t_embedder(t) + + ( + x, + cap_feats, + x_size, + x_pos_ids, + cap_pos_ids, + x_inner_pad_mask, + cap_inner_pad_mask, + ) = self.patchify_and_embed(x, cap_feats, patch_size, f_patch_size) + + x_item_seqlens = [len(_) for _ in x] + assert all(_ % SEQ_MULTI_OF == 0 for _ in x_item_seqlens) + x_max_item_seqlen = max(x_item_seqlens) + + control_context = self.patchify(control_context, patch_size, f_patch_size) + control_context = torch.cat(control_context, dim=0) + control_context = self.control_all_x_embedder[f"{patch_size}-{f_patch_size}"](control_context) + + control_context[torch.cat(x_inner_pad_mask)] = self.x_pad_token + control_context = list(control_context.split(x_item_seqlens, dim=0)) + + control_context = pad_sequence(control_context, batch_first=True, padding_value=0.0) + + # x embed & refine + x = torch.cat(x, dim=0) + x = self.all_x_embedder[f"{patch_size}-{f_patch_size}"](x) + + # Match t_embedder output dtype to x for layerwise casting compatibility + adaln_input = t.type_as(x) + x[torch.cat(x_inner_pad_mask)] = self.x_pad_token + x = list(x.split(x_item_seqlens, dim=0)) + x_freqs_cis = list(self.rope_embedder(torch.cat(x_pos_ids, dim=0)).split([len(_) for _ in x_pos_ids], dim=0)) + + x = pad_sequence(x, batch_first=True, padding_value=0.0) + x_freqs_cis = pad_sequence(x_freqs_cis, batch_first=True, padding_value=0.0) + # Clarify the length matches to satisfy Dynamo due to "Symbolic Shape Inference" to avoid compilation errors + x_freqs_cis = x_freqs_cis[:, : x.shape[1]] + + x_attn_mask = torch.zeros((bsz, x_max_item_seqlen), dtype=torch.bool, device=device) + for i, seq_len in enumerate(x_item_seqlens): + x_attn_mask[i, :seq_len] = 1 + + if self.add_control_noise_refiner is not None: + if self.add_control_noise_refiner == "control_layers": + layers = self.control_layers + elif self.add_control_noise_refiner == "control_noise_refiner": + layers = self.control_noise_refiner + else: + raise ValueError(f"Unsupported `add_control_noise_refiner` type: {self.add_control_noise_refiner}.") + for layer in layers: + if torch.is_grad_enabled() and self.gradient_checkpointing: + control_context = self._gradient_checkpointing_func( + layer, control_context, x, x_attn_mask, x_freqs_cis, adaln_input + ) + else: + control_context = layer(control_context, x, x_attn_mask, x_freqs_cis, adaln_input) + + hints = torch.unbind(control_context)[:-1] + control_context = torch.unbind(control_context)[-1] + noise_refiner_block_samples = { + layer_idx: hints[idx] * conditioning_scale + for idx, layer_idx in enumerate(self.control_refiner_layers_places) + } + else: + noise_refiner_block_samples = None + + if torch.is_grad_enabled() and self.gradient_checkpointing: + for layer_idx, layer in enumerate(self.noise_refiner): + x = self._gradient_checkpointing_func(layer, x, x_attn_mask, x_freqs_cis, adaln_input) + if noise_refiner_block_samples is not None: + if layer_idx in noise_refiner_block_samples: + x = x + noise_refiner_block_samples[layer_idx] + else: + for layer_idx, layer in enumerate(self.noise_refiner): + x = layer(x, x_attn_mask, x_freqs_cis, adaln_input) + if noise_refiner_block_samples is not None: + if layer_idx in noise_refiner_block_samples: + x = x + noise_refiner_block_samples[layer_idx] + + # cap embed & refine + cap_item_seqlens = [len(_) for _ in cap_feats] + cap_max_item_seqlen = max(cap_item_seqlens) + + 
cap_feats = torch.cat(cap_feats, dim=0) + cap_feats = self.cap_embedder(cap_feats) + cap_feats[torch.cat(cap_inner_pad_mask)] = self.cap_pad_token + cap_feats = list(cap_feats.split(cap_item_seqlens, dim=0)) + cap_freqs_cis = list( + self.rope_embedder(torch.cat(cap_pos_ids, dim=0)).split([len(_) for _ in cap_pos_ids], dim=0) + ) + + cap_feats = pad_sequence(cap_feats, batch_first=True, padding_value=0.0) + cap_freqs_cis = pad_sequence(cap_freqs_cis, batch_first=True, padding_value=0.0) + # Clarify the length matches to satisfy Dynamo due to "Symbolic Shape Inference" to avoid compilation errors + cap_freqs_cis = cap_freqs_cis[:, : cap_feats.shape[1]] + + cap_attn_mask = torch.zeros((bsz, cap_max_item_seqlen), dtype=torch.bool, device=device) + for i, seq_len in enumerate(cap_item_seqlens): + cap_attn_mask[i, :seq_len] = 1 + + if torch.is_grad_enabled() and self.gradient_checkpointing: + for layer in self.context_refiner: + cap_feats = self._gradient_checkpointing_func(layer, cap_feats, cap_attn_mask, cap_freqs_cis) + else: + for layer in self.context_refiner: + cap_feats = layer(cap_feats, cap_attn_mask, cap_freqs_cis) + + # unified + unified = [] + unified_freqs_cis = [] + for i in range(bsz): + x_len = x_item_seqlens[i] + cap_len = cap_item_seqlens[i] + unified.append(torch.cat([x[i][:x_len], cap_feats[i][:cap_len]])) + unified_freqs_cis.append(torch.cat([x_freqs_cis[i][:x_len], cap_freqs_cis[i][:cap_len]])) + unified_item_seqlens = [a + b for a, b in zip(cap_item_seqlens, x_item_seqlens)] + assert unified_item_seqlens == [len(_) for _ in unified] + unified_max_item_seqlen = max(unified_item_seqlens) + + unified = pad_sequence(unified, batch_first=True, padding_value=0.0) + unified_freqs_cis = pad_sequence(unified_freqs_cis, batch_first=True, padding_value=0.0) + unified_attn_mask = torch.zeros((bsz, unified_max_item_seqlen), dtype=torch.bool, device=device) + for i, seq_len in enumerate(unified_item_seqlens): + unified_attn_mask[i, :seq_len] = 1 + + ## ControlNet start + if not self.add_control_noise_refiner: + if torch.is_grad_enabled() and self.gradient_checkpointing: + for layer in self.control_noise_refiner: + control_context = self._gradient_checkpointing_func( + layer, control_context, x_attn_mask, x_freqs_cis, adaln_input + ) + else: + for layer in self.control_noise_refiner: + control_context = layer(control_context, x_attn_mask, x_freqs_cis, adaln_input) + + # unified + control_context_unified = [] + for i in range(bsz): + x_len = x_item_seqlens[i] + cap_len = cap_item_seqlens[i] + control_context_unified.append(torch.cat([control_context[i][:x_len], cap_feats[i][:cap_len]])) + control_context_unified = pad_sequence(control_context_unified, batch_first=True, padding_value=0.0) + + for layer in self.control_layers: + if torch.is_grad_enabled() and self.gradient_checkpointing: + control_context_unified = self._gradient_checkpointing_func( + layer, control_context_unified, unified, unified_attn_mask, unified_freqs_cis, adaln_input + ) + else: + control_context_unified = layer( + control_context_unified, unified, unified_attn_mask, unified_freqs_cis, adaln_input + ) + + hints = torch.unbind(control_context_unified)[:-1] + controlnet_block_samples = { + layer_idx: hints[idx] * conditioning_scale for idx, layer_idx in enumerate(self.control_layers_places) + } + return controlnet_block_samples diff --git a/src/diffusers/models/transformers/transformer_z_image.py b/src/diffusers/models/transformers/transformer_z_image.py index 5c401b9d20..17197db3a4 100644 --- 
a/src/diffusers/models/transformers/transformer_z_image.py +++ b/src/diffusers/models/transformers/transformer_z_image.py @@ -13,7 +13,7 @@ # limitations under the License. import math -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple import torch import torch.nn as nn @@ -536,6 +536,7 @@ class ZImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOr x: List[torch.Tensor], t, cap_feats: List[torch.Tensor], + controlnet_block_samples: Optional[Dict[int, torch.Tensor]] = None, patch_size=2, f_patch_size=1, return_dict: bool = True, @@ -635,13 +636,19 @@ class ZImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOr unified_attn_mask[i, :seq_len] = 1 if torch.is_grad_enabled() and self.gradient_checkpointing: - for layer in self.layers: + for layer_idx, layer in enumerate(self.layers): unified = self._gradient_checkpointing_func( layer, unified, unified_attn_mask, unified_freqs_cis, adaln_input ) + if controlnet_block_samples is not None: + if layer_idx in controlnet_block_samples: + unified = unified + controlnet_block_samples[layer_idx] else: - for layer in self.layers: + for layer_idx, layer in enumerate(self.layers): unified = layer(unified, unified_attn_mask, unified_freqs_cis, adaln_input) + if controlnet_block_samples is not None: + if layer_idx in controlnet_block_samples: + unified = unified + controlnet_block_samples[layer_idx] unified = self.all_final_layer[f"{patch_size}-{f_patch_size}"](unified, adaln_input) unified = list(unified.unbind(dim=0)) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index a2a374906b..04ec6b5cd8 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -405,7 +405,12 @@ else: "Kandinsky5T2IPipeline", "Kandinsky5I2IPipeline", ] - _import_structure["z_image"] = ["ZImageImg2ImgPipeline", "ZImagePipeline"] + _import_structure["z_image"] = [ + "ZImageImg2ImgPipeline", + "ZImagePipeline", + "ZImageControlNetPipeline", + "ZImageControlNetInpaintPipeline", + ] _import_structure["skyreels_v2"] = [ "SkyReelsV2DiffusionForcingPipeline", "SkyReelsV2DiffusionForcingImageToVideoPipeline", @@ -845,7 +850,12 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: WuerstchenDecoderPipeline, WuerstchenPriorPipeline, ) - from .z_image import ZImageImg2ImgPipeline, ZImagePipeline + from .z_image import ( + ZImageControlNetInpaintPipeline, + ZImageControlNetPipeline, + ZImageImg2ImgPipeline, + ZImagePipeline, + ) try: if not is_onnx_available(): diff --git a/src/diffusers/pipelines/z_image/__init__.py b/src/diffusers/pipelines/z_image/__init__.py index f4342713e3..7b3cfbceea 100644 --- a/src/diffusers/pipelines/z_image/__init__.py +++ b/src/diffusers/pipelines/z_image/__init__.py @@ -23,6 +23,8 @@ except OptionalDependencyNotAvailable: else: _import_structure["pipeline_output"] = ["ZImagePipelineOutput"] _import_structure["pipeline_z_image"] = ["ZImagePipeline"] + _import_structure["pipeline_z_image_controlnet"] = ["ZImageControlNetPipeline"] + _import_structure["pipeline_z_image_controlnet_inpaint"] = ["ZImageControlNetInpaintPipeline"] _import_structure["pipeline_z_image_img2img"] = ["ZImageImg2ImgPipeline"] @@ -36,6 +38,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: else: from .pipeline_output import ZImagePipelineOutput from .pipeline_z_image import ZImagePipeline + from .pipeline_z_image_controlnet import ZImageControlNetPipeline + from .pipeline_z_image_controlnet_inpaint import ZImageControlNetInpaintPipeline from 
.pipeline_z_image_img2img import ZImageImg2ImgPipeline else: diff --git a/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet.py b/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet.py new file mode 100644 index 0000000000..5e26862b01 --- /dev/null +++ b/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet.py @@ -0,0 +1,725 @@ +# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +from transformers import AutoTokenizer, PreTrainedModel + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin +from ...models.autoencoders import AutoencoderKL +from ...models.controlnets import ZImageControlNetModel +from ...models.transformers import ZImageTransformer2DModel +from ...pipelines.pipeline_utils import DiffusionPipeline +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor +from .pipeline_output import ZImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import ZImageControlNetPipeline + >>> from diffusers import ZImageControlNetModel + >>> from diffusers.utils import load_image + >>> from huggingface_hub import hf_hub_download + + >>> controlnet = ZImageControlNetModel.from_single_file( + ... hf_hub_download( + ... "alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union", + ... filename="Z-Image-Turbo-Fun-Controlnet-Union.safetensors", + ... ), + ... torch_dtype=torch.bfloat16, + ... ) + + >>> # 2.1 + >>> # controlnet = ZImageControlNetModel.from_single_file( + >>> # hf_hub_download( + >>> # "alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0", + >>> # filename="Z-Image-Turbo-Fun-Controlnet-Union-2.1.safetensors", + >>> # ), + >>> # torch_dtype=torch.bfloat16, + >>> # ) + + >>> # 2.0 - `config` is required + >>> # controlnet = ZImageControlNetModel.from_single_file( + >>> # hf_hub_download( + >>> # "alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0", + >>> # filename="Z-Image-Turbo-Fun-Controlnet-Union-2.0.safetensors", + >>> # ), + >>> # torch_dtype=torch.bfloat16, + >>> # config="hlky/Z-Image-Turbo-Fun-Controlnet-Union-2.0", + >>> # ) + + >>> pipe = ZImageControlNetPipeline.from_pretrained( + ... "Tongyi-MAI/Z-Image-Turbo", controlnet=controlnet, torch_dtype=torch.bfloat16 + ... ) + >>> pipe.to("cuda") + + >>> # Optionally, set the attention backend to flash-attn 2 or 3, default is SDPA in PyTorch. + >>> # (1) Use flash attention 2 + >>> # pipe.transformer.set_attention_backend("flash") + >>> # (2) Use flash attention 3 + >>> # pipe.transformer.set_attention_backend("_flash_3") + + >>> control_image = load_image( + ... 
"https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union/resolve/main/asset/pose.jpg?download=true" + ... ) + >>> prompt = "一位年轻女子站在阳光明媚的海岸线上,白裙在轻拂的海风中微微飘动。她拥有一头鲜艳的紫色长发,在风中轻盈舞动,发间系着一个精致的黑色蝴蝶结,与身后柔和的蔚蓝天空形成鲜明对比。她面容清秀,眉目精致,透着一股甜美的青春气息;神情柔和,略带羞涩,目光静静地凝望着远方的地平线,双手自然交叠于身前,仿佛沉浸在思绪之中。在她身后,是辽阔无垠、波光粼粼的大海,阳光洒在海面上,映出温暖的金色光晕。" + >>> image = pipe( + ... prompt, + ... control_image=control_image, + ... controlnet_conditioning_scale=0.75, + ... height=1728, + ... width=992, + ... num_inference_steps=9, + ... guidance_scale=0.0, + ... generator=torch.Generator("cuda").manual_seed(43), + ... ).images[0] + >>> image.save("zimage.png") + ``` +""" + + +# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.15, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class ZImageControlNetPipeline(DiffusionPipeline, FromSingleFileMixin): + model_cpu_offload_seq = "text_encoder->transformer->vae" + _optional_components = [] + _callback_tensor_inputs = ["latents", "prompt_embeds"] + + def __init__( + self, + scheduler: FlowMatchEulerDiscreteScheduler, + vae: AutoencoderKL, + text_encoder: PreTrainedModel, + tokenizer: AutoTokenizer, + transformer: ZImageTransformer2DModel, + controlnet: ZImageControlNetModel, + ): + super().__init__() + controlnet = ZImageControlNetModel.from_transformer(controlnet, transformer) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + transformer=transformer, + controlnet=controlnet, + ) + self.vae_scale_factor = ( + 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8 + ) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) + + def encode_prompt( + self, + prompt: Union[str, List[str]], + device: Optional[torch.device] = None, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[List[torch.FloatTensor]] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + max_sequence_length: int = 512, + ): + prompt = [prompt] if isinstance(prompt, str) else prompt + prompt_embeds = self._encode_prompt( + prompt=prompt, + device=device, + prompt_embeds=prompt_embeds, + max_sequence_length=max_sequence_length, + ) + + if do_classifier_free_guidance: + if negative_prompt is None: + negative_prompt = ["" for _ in prompt] + else: + negative_prompt = [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + assert len(prompt) == len(negative_prompt) + negative_prompt_embeds = self._encode_prompt( + prompt=negative_prompt, + device=device, + prompt_embeds=negative_prompt_embeds, + max_sequence_length=max_sequence_length, + ) + else: + negative_prompt_embeds = [] + return prompt_embeds, negative_prompt_embeds + + def _encode_prompt( + self, + prompt: Union[str, List[str]], + device: Optional[torch.device] = None, + prompt_embeds: Optional[List[torch.FloatTensor]] = None, + max_sequence_length: int = 512, + ) -> List[torch.FloatTensor]: + device = device or self._execution_device + + if prompt_embeds is not None: 
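+            # If precomputed embeddings are passed, skip tokenization and the text-encoder forward pass.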
+ return prompt_embeds + + if isinstance(prompt, str): + prompt = [prompt] + + for i, prompt_item in enumerate(prompt): + messages = [ + {"role": "user", "content": prompt_item}, + ] + prompt_item = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=True, + ) + prompt[i] = prompt_item + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids.to(device) + prompt_masks = text_inputs.attention_mask.to(device).bool() + + prompt_embeds = self.text_encoder( + input_ids=text_input_ids, + attention_mask=prompt_masks, + output_hidden_states=True, + ).hidden_states[-2] + + embeddings_list = [] + + for i in range(len(prompt_embeds)): + embeddings_list.append(prompt_embeds[i][prompt_masks[i]]) + + return embeddings_list + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + ): + height = 2 * (int(height) // (self.vae_scale_factor * 2)) + width = 2 * (int(width) // (self.vae_scale_factor * 2)) + + shape = (batch_size, num_channels_latents, height, width) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + return latents + + # Copied from diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet.StableDiffusion3ControlNetPipeline.prepare_image + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + if isinstance(image, torch.Tensor): + pass + else: + image = self.image_processor.preprocess(image, height=height, width=width) + + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def joint_attention_kwargs(self): + return self._joint_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + sigmas: Optional[List[float]] = None, + guidance_scale: float = 5.0, + control_image: PipelineImageInput = None, + controlnet_conditioning_scale: Union[float, List[float]] = 0.75, + cfg_normalization: bool = False, + cfg_truncation: float = 1.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[List[torch.FloatTensor]] = None, + negative_prompt_embeds: 
Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 512, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to 1024): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 1024): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + guidance_scale (`float`, *optional*, defaults to 5.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + cfg_normalization (`bool`, *optional*, defaults to False): + Whether to apply configuration normalization. + cfg_truncation (`float`, *optional*, defaults to 1.0): + The truncation value for configuration. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + prompt_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.z_image.ZImagePipelineOutput`] instead of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that is called at the end of each denoising step during inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, *optional*, defaults to 512):
+                Maximum sequence length to use with the `prompt`.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.z_image.ZImagePipelineOutput`] or `tuple`: [`~pipelines.z_image.ZImagePipelineOutput`] if
+            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
+            generated images.
+        """
+        height = height or 1024
+        width = width or 1024
+
+        vae_scale = self.vae_scale_factor * 2
+        if height % vae_scale != 0:
+            raise ValueError(
+                f"Height must be divisible by {vae_scale} (got {height}). "
+                f"Please adjust the height to a multiple of {vae_scale}."
+            )
+        if width % vae_scale != 0:
+            raise ValueError(
+                f"Width must be divisible by {vae_scale} (got {width}). "
+                f"Please adjust the width to a multiple of {vae_scale}."
+            )
+
+        device = self._execution_device
+
+        self._guidance_scale = guidance_scale
+        self._joint_attention_kwargs = joint_attention_kwargs
+        self._interrupt = False
+        self._cfg_normalization = cfg_normalization
+        self._cfg_truncation = cfg_truncation
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = len(prompt_embeds)
+
+        # If prompt_embeds is provided and prompt is None, skip encoding
+        if prompt_embeds is not None and prompt is None:
+            if self.do_classifier_free_guidance and negative_prompt_embeds is None:
+                raise ValueError(
+                    "When `prompt_embeds` is provided without `prompt`, "
+                    "`negative_prompt_embeds` must also be provided for classifier-free guidance."
+                )
+        else:
+            (
+                prompt_embeds,
+                negative_prompt_embeds,
+            ) = self.encode_prompt(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                do_classifier_free_guidance=self.do_classifier_free_guidance,
+                prompt_embeds=prompt_embeds,
+                negative_prompt_embeds=negative_prompt_embeds,
+                device=device,
+                max_sequence_length=max_sequence_length,
+            )
+
+        # 4. 
Prepare latent variables + num_channels_latents = self.transformer.in_channels + + control_image = self.prepare_image( + image=control_image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=self.vae.dtype, + ) + height, width = control_image.shape[-2:] + control_image = retrieve_latents(self.vae.encode(control_image), generator=generator, sample_mode="argmax") + control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor + control_image = control_image.unsqueeze(2) + + if num_channels_latents != self.controlnet.config.control_in_dim: + # For model version 2.0 + control_image = torch.cat( + [ + control_image, + torch.zeros( + control_image.shape[0], + self.controlnet.config.control_in_dim - num_channels_latents, + *control_image.shape[2:], + ).to(device=control_image.device, dtype=control_image.dtype), + ], + dim=1, + ) + + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + torch.float32, + device, + generator, + latents, + ) + + # Repeat prompt_embeds for num_images_per_prompt + if num_images_per_prompt > 1: + prompt_embeds = [pe for pe in prompt_embeds for _ in range(num_images_per_prompt)] + if self.do_classifier_free_guidance and negative_prompt_embeds: + negative_prompt_embeds = [npe for npe in negative_prompt_embeds for _ in range(num_images_per_prompt)] + + actual_batch_size = batch_size * num_images_per_prompt + image_seq_len = (latents.shape[2] // 2) * (latents.shape[3] // 2) + + # 5. Prepare timesteps + mu = calculate_shift( + image_seq_len, + self.scheduler.config.get("base_image_seq_len", 256), + self.scheduler.config.get("max_image_seq_len", 4096), + self.scheduler.config.get("base_shift", 0.5), + self.scheduler.config.get("max_shift", 1.15), + ) + self.scheduler.sigma_min = 0.0 + scheduler_kwargs = {"mu": mu} + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + sigmas=sigmas, + **scheduler_kwargs, + ) + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + self._num_timesteps = len(timesteps) + + # 6. 
Denoising loop + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latents.shape[0]) + timestep = (1000 - timestep) / 1000 + # Normalized time for time-aware config (0 at start, 1 at end) + t_norm = timestep[0].item() + + # Handle cfg truncation + current_guidance_scale = self.guidance_scale + if ( + self.do_classifier_free_guidance + and self._cfg_truncation is not None + and float(self._cfg_truncation) <= 1 + ): + if t_norm > self._cfg_truncation: + current_guidance_scale = 0.0 + + # Run CFG only if configured AND scale is non-zero + apply_cfg = self.do_classifier_free_guidance and current_guidance_scale > 0 + + if apply_cfg: + latents_typed = latents.to(self.transformer.dtype) + latent_model_input = latents_typed.repeat(2, 1, 1, 1) + prompt_embeds_model_input = prompt_embeds + negative_prompt_embeds + timestep_model_input = timestep.repeat(2) + else: + latent_model_input = latents.to(self.transformer.dtype) + prompt_embeds_model_input = prompt_embeds + timestep_model_input = timestep + + latent_model_input = latent_model_input.unsqueeze(2) + latent_model_input_list = list(latent_model_input.unbind(dim=0)) + + controlnet_block_samples = self.controlnet( + latent_model_input_list, + timestep_model_input, + prompt_embeds_model_input, + control_image, + conditioning_scale=controlnet_conditioning_scale, + ) + + model_out_list = self.transformer( + latent_model_input_list, + timestep_model_input, + prompt_embeds_model_input, + controlnet_block_samples=controlnet_block_samples, + )[0] + + if apply_cfg: + # Perform CFG + pos_out = model_out_list[:actual_batch_size] + neg_out = model_out_list[actual_batch_size:] + + noise_pred = [] + for j in range(actual_batch_size): + pos = pos_out[j].float() + neg = neg_out[j].float() + + pred = pos + current_guidance_scale * (pos - neg) + + # Renormalization + if self._cfg_normalization and float(self._cfg_normalization) > 0.0: + ori_pos_norm = torch.linalg.vector_norm(pos) + new_pos_norm = torch.linalg.vector_norm(pred) + max_new_norm = ori_pos_norm * float(self._cfg_normalization) + if new_pos_norm > max_new_norm: + pred = pred * (max_new_norm / new_pos_norm) + + noise_pred.append(pred) + + noise_pred = torch.stack(noise_pred, dim=0) + else: + noise_pred = torch.stack([t.float() for t in model_out_list], dim=0) + + noise_pred = noise_pred.squeeze(2) + noise_pred = -noise_pred + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred.to(torch.float32), t, latents, return_dict=False)[0] + assert latents.dtype == torch.float32 + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if output_type == "latent": + image = latents + + else: + latents = latents.to(self.vae.dtype) + latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor + + image = self.vae.decode(latents, 
return_dict=False)[0] + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ZImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet_inpaint.py b/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet_inpaint.py new file mode 100644 index 0000000000..73ea7d0fdd --- /dev/null +++ b/src/diffusers/pipelines/z_image/pipeline_z_image_controlnet_inpaint.py @@ -0,0 +1,747 @@ +# Copyright 2025 Alibaba Z-Image Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +import torch.nn.functional as F +from transformers import AutoTokenizer, PreTrainedModel + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin +from ...models.autoencoders import AutoencoderKL +from ...models.controlnets import ZImageControlNetModel +from ...models.transformers import ZImageTransformer2DModel +from ...pipelines.pipeline_utils import DiffusionPipeline +from ...schedulers import FlowMatchEulerDiscreteScheduler +from ...utils import logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor +from .pipeline_output import ZImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import ZImageControlNetInpaintPipeline + >>> from diffusers import ZImageControlNetModel + >>> from diffusers.utils import load_image + >>> from huggingface_hub import hf_hub_download + + >>> controlnet = ZImageControlNetModel.from_single_file( + ... hf_hub_download( + ... "alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0", + ... filename="Z-Image-Turbo-Fun-Controlnet-Union-2.1.safetensors", + ... ), + ... torch_dtype=torch.bfloat16, + ... ) + + >>> # 2.0 - `config` is required + >>> # controlnet = ZImageControlNetModel.from_single_file( + >>> # hf_hub_download( + >>> # "alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0", + >>> # filename="Z-Image-Turbo-Fun-Controlnet-Union-2.0.safetensors", + >>> # ), + >>> # torch_dtype=torch.bfloat16, + >>> # config="hlky/Z-Image-Turbo-Fun-Controlnet-Union-2.0", + >>> # ) + + >>> pipe = ZImageControlNetInpaintPipeline.from_pretrained( + ... "Tongyi-MAI/Z-Image-Turbo", controlnet=controlnet, torch_dtype=torch.bfloat16 + ... ) + >>> pipe.to("cuda") + + >>> # Optionally, set the attention backend to flash-attn 2 or 3, default is SDPA in PyTorch. + >>> # (1) Use flash attention 2 + >>> # pipe.transformer.set_attention_backend("flash") + >>> # (2) Use flash attention 3 + >>> # pipe.transformer.set_attention_backend("_flash_3") + + >>> image = load_image( + ... "https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0/resolve/main/asset/inpaint.jpg?download=true" + ... 
) + >>> mask_image = load_image( + ... "https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0/resolve/main/asset/mask.jpg?download=true" + ... ) + >>> control_image = load_image( + ... "https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0/resolve/main/asset/pose.jpg?download=true" + ... ) + >>> prompt = "一位年轻女子站在阳光明媚的海岸线上,画面为全身竖构图,身体微微侧向右侧,左手自然下垂,右臂弯曲扶在腰间,她的手指清晰可见,站姿放松而略带羞涩。她身穿轻盈的白色连衣裙,裙摆在海风中轻轻飘动,布料半透、质感柔软。女子拥有一头鲜艳的及腰紫色长发,被海风吹起,在身侧轻盈飞舞,发间系着一个精致的黑色蝴蝶结,与发色形成对比。她面容清秀,眉目精致,肤色白皙细腻,表情温柔略显羞涩,微微低头,眼神静静望向远处的海平线,流露出甜美的青春气息与若有所思的神情。背景是辽阔无垠的海洋与蔚蓝天空,阳光从侧前方洒下,海面波光粼粼,泛着温暖的金色光晕,天空清澈明亮,云朵稀薄,整体色调清新唯美。" + >>> image = pipe( + ... prompt, + ... image=image, + ... mask_image=mask_image, + ... control_image=control_image, + ... controlnet_conditioning_scale=0.75, + ... height=1728, + ... width=992, + ... num_inference_steps=25, + ... guidance_scale=0.0, + ... generator=torch.Generator("cuda").manual_seed(43), + ... ).images[0] + >>> image.save("zimage-inpaint.png") + ``` +""" + + +# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.15, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. 
+ """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class ZImageControlNetInpaintPipeline(DiffusionPipeline, FromSingleFileMixin): + model_cpu_offload_seq = "text_encoder->transformer->vae" + _optional_components = [] + _callback_tensor_inputs = ["latents", "prompt_embeds"] + + def __init__( + self, + scheduler: FlowMatchEulerDiscreteScheduler, + vae: AutoencoderKL, + text_encoder: PreTrainedModel, + tokenizer: AutoTokenizer, + transformer: ZImageTransformer2DModel, + controlnet: ZImageControlNetModel, + ): + super().__init__() + if transformer.in_channels == controlnet.config.control_in_dim: + raise ValueError( + "ZImageControlNetInpaintPipeline is not compatible with `alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union`, use `alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union-2.0`." 
+ ) + controlnet = ZImageControlNetModel.from_transformer(controlnet, transformer) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + transformer=transformer, + controlnet=controlnet, + ) + self.vae_scale_factor = ( + 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8 + ) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) + + def encode_prompt( + self, + prompt: Union[str, List[str]], + device: Optional[torch.device] = None, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[List[torch.FloatTensor]] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + max_sequence_length: int = 512, + ): + prompt = [prompt] if isinstance(prompt, str) else prompt + prompt_embeds = self._encode_prompt( + prompt=prompt, + device=device, + prompt_embeds=prompt_embeds, + max_sequence_length=max_sequence_length, + ) + + if do_classifier_free_guidance: + if negative_prompt is None: + negative_prompt = ["" for _ in prompt] + else: + negative_prompt = [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + assert len(prompt) == len(negative_prompt) + negative_prompt_embeds = self._encode_prompt( + prompt=negative_prompt, + device=device, + prompt_embeds=negative_prompt_embeds, + max_sequence_length=max_sequence_length, + ) + else: + negative_prompt_embeds = [] + return prompt_embeds, negative_prompt_embeds + + def _encode_prompt( + self, + prompt: Union[str, List[str]], + device: Optional[torch.device] = None, + prompt_embeds: Optional[List[torch.FloatTensor]] = None, + max_sequence_length: int = 512, + ) -> List[torch.FloatTensor]: + device = device or self._execution_device + + if prompt_embeds is not None: + return prompt_embeds + + if isinstance(prompt, str): + prompt = [prompt] + + for i, prompt_item in enumerate(prompt): + messages = [ + {"role": "user", "content": prompt_item}, + ] + prompt_item = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=True, + ) + prompt[i] = prompt_item + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids.to(device) + prompt_masks = text_inputs.attention_mask.to(device).bool() + + prompt_embeds = self.text_encoder( + input_ids=text_input_ids, + attention_mask=prompt_masks, + output_hidden_states=True, + ).hidden_states[-2] + + embeddings_list = [] + + for i in range(len(prompt_embeds)): + embeddings_list.append(prompt_embeds[i][prompt_masks[i]]) + + return embeddings_list + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + ): + height = 2 * (int(height) // (self.vae_scale_factor * 2)) + width = 2 * (int(width) // (self.vae_scale_factor * 2)) + + shape = (batch_size, num_channels_latents, height, width) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + return latents + 
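+    # Shape note: `prepare_latents` floors the requested resolution onto the VAE grid before sampling noise. The
+    # latent it returns is spatially `2 * (height // (self.vae_scale_factor * 2))` by
+    # `2 * (width // (self.vae_scale_factor * 2))`, i.e. `height // 8` by `width // 8` for the default
+    # vae_scale_factor of 8 (the divisibility checks in `__call__` make the two expressions equivalent). For
+    # example, the 1728x992 call in the example docstring yields a `(batch_size, num_channels_latents, 216, 124)`
+    # latent.
+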
+ # Copied from diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet.StableDiffusion3ControlNetPipeline.prepare_image + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + if isinstance(image, torch.Tensor): + pass + else: + image = self.image_processor.preprocess(image, height=height, width=width) + + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def joint_attention_kwargs(self): + return self._joint_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + sigmas: Optional[List[float]] = None, + guidance_scale: float = 5.0, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, + control_image: PipelineImageInput = None, + controlnet_conditioning_scale: Union[float, List[float]] = 0.75, + cfg_normalization: bool = False, + cfg_truncation: float = 1.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[List[torch.FloatTensor]] = None, + negative_prompt_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 512, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to 1024): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 1024): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + guidance_scale (`float`, *optional*, defaults to 5.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. 
of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. A higher guidance scale encourages the model to generate images that are closely linked to the
+                text `prompt`, usually at the expense of lower image quality.
+            image (`PipelineImageInput`, *optional*):
+                The image to inpaint. The region selected by `mask_image` is regenerated, while the rest of the image
+                is preserved through the masked-latent conditioning.
+            mask_image (`PipelineImageInput`, *optional*):
+                The mask indicating which region of `image` to repaint: white (1) pixels are repainted, black (0)
+                pixels are preserved.
+            control_image (`PipelineImageInput`, *optional*):
+                The ControlNet input condition. The control image is encoded by the VAE and concatenated with the
+                mask and the masked-image latents before being passed to the ControlNet.
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 0.75):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residuals of the transformer blocks.
+            cfg_normalization (`bool`, *optional*, defaults to False):
+                Whether to rescale the classifier-free guided prediction so that its norm does not exceed the norm of
+                the conditional prediction.
+            cfg_truncation (`float`, *optional*, defaults to 1.0):
+                Normalized-time threshold (0 at the start of sampling, 1 at the end) after which classifier-free
+                guidance is disabled. The default of 1.0 keeps guidance active for all steps.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from the `prompt` input argument.
+            negative_prompt_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, `negative_prompt_embeds` will be generated from the `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.z_image.ZImagePipelineOutput`] instead of a plain tuple.
+            joint_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function that is called at the end of each denoising step during inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, *optional*, defaults to 512):
+                Maximum sequence length to use with the `prompt`.
+ + Examples: + + Returns: + [`~pipelines.z_image.ZImagePipelineOutput`] or `tuple`: [`~pipelines.z_image.ZImagePipelineOutput`] if + `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the + generated images. + """ + height = height or 1024 + width = width or 1024 + + vae_scale = self.vae_scale_factor * 2 + if height % vae_scale != 0: + raise ValueError( + f"Height must be divisible by {vae_scale} (got {height}). " + f"Please adjust the height to a multiple of {vae_scale}." + ) + if width % vae_scale != 0: + raise ValueError( + f"Width must be divisible by {vae_scale} (got {width}). " + f"Please adjust the width to a multiple of {vae_scale}." + ) + + device = self._execution_device + + self._guidance_scale = guidance_scale + self._joint_attention_kwargs = joint_attention_kwargs + self._interrupt = False + self._cfg_normalization = cfg_normalization + self._cfg_truncation = cfg_truncation + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = len(prompt_embeds) + + # If prompt_embeds is provided and prompt is None, skip encoding + if prompt_embeds is not None and prompt is None: + if self.do_classifier_free_guidance and negative_prompt_embeds is None: + raise ValueError( + "When `prompt_embeds` is provided without `prompt`, " + "`negative_prompt_embeds` must also be provided for classifier-free guidance." + ) + else: + ( + prompt_embeds, + negative_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + negative_prompt=negative_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + device=device, + max_sequence_length=max_sequence_length, + ) + + # 4. 
Prepare latent variables + num_channels_latents = self.transformer.in_channels + + control_image = self.prepare_image( + image=control_image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=self.vae.dtype, + ) + height, width = control_image.shape[-2:] + control_image = retrieve_latents(self.vae.encode(control_image), generator=generator, sample_mode="argmax") + control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor + control_image = control_image.unsqueeze(2) + + mask_condition = self.mask_processor.preprocess(mask_image, height=height, width=width) + mask_condition = torch.tile(mask_condition, [1, 3, 1, 1]).to( + device=control_image.device, dtype=control_image.dtype + ) + + init_image = self.prepare_image( + image=image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=self.vae.dtype, + ) + height, width = init_image.shape[-2:] + init_image = init_image * (mask_condition < 0.5) + init_image = retrieve_latents(self.vae.encode(init_image), generator=generator, sample_mode="argmax") + init_image = (init_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor + init_image = init_image.unsqueeze(2) + + mask_condition = F.interpolate(1 - mask_condition[:, :1], size=init_image.size()[-2:], mode="nearest").to( + device=control_image.device, dtype=control_image.dtype + ) + mask_condition = mask_condition.unsqueeze(2) + + control_image = torch.cat([control_image, mask_condition, init_image], dim=1) + + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + torch.float32, + device, + generator, + latents, + ) + + # Repeat prompt_embeds for num_images_per_prompt + if num_images_per_prompt > 1: + prompt_embeds = [pe for pe in prompt_embeds for _ in range(num_images_per_prompt)] + if self.do_classifier_free_guidance and negative_prompt_embeds: + negative_prompt_embeds = [npe for npe in negative_prompt_embeds for _ in range(num_images_per_prompt)] + + actual_batch_size = batch_size * num_images_per_prompt + image_seq_len = (latents.shape[2] // 2) * (latents.shape[3] // 2) + + # 5. Prepare timesteps + mu = calculate_shift( + image_seq_len, + self.scheduler.config.get("base_image_seq_len", 256), + self.scheduler.config.get("max_image_seq_len", 4096), + self.scheduler.config.get("base_shift", 0.5), + self.scheduler.config.get("max_shift", 1.15), + ) + self.scheduler.sigma_min = 0.0 + scheduler_kwargs = {"mu": mu} + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + sigmas=sigmas, + **scheduler_kwargs, + ) + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + self._num_timesteps = len(timesteps) + + # 6. 
Denoising loop + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latents.shape[0]) + timestep = (1000 - timestep) / 1000 + # Normalized time for time-aware config (0 at start, 1 at end) + t_norm = timestep[0].item() + + # Handle cfg truncation + current_guidance_scale = self.guidance_scale + if ( + self.do_classifier_free_guidance + and self._cfg_truncation is not None + and float(self._cfg_truncation) <= 1 + ): + if t_norm > self._cfg_truncation: + current_guidance_scale = 0.0 + + # Run CFG only if configured AND scale is non-zero + apply_cfg = self.do_classifier_free_guidance and current_guidance_scale > 0 + + if apply_cfg: + latents_typed = latents.to(self.transformer.dtype) + latent_model_input = latents_typed.repeat(2, 1, 1, 1) + prompt_embeds_model_input = prompt_embeds + negative_prompt_embeds + timestep_model_input = timestep.repeat(2) + else: + latent_model_input = latents.to(self.transformer.dtype) + prompt_embeds_model_input = prompt_embeds + timestep_model_input = timestep + + latent_model_input = latent_model_input.unsqueeze(2) + latent_model_input_list = list(latent_model_input.unbind(dim=0)) + + controlnet_block_samples = self.controlnet( + latent_model_input_list, + timestep_model_input, + prompt_embeds_model_input, + control_image, + conditioning_scale=controlnet_conditioning_scale, + ) + + model_out_list = self.transformer( + latent_model_input_list, + timestep_model_input, + prompt_embeds_model_input, + controlnet_block_samples=controlnet_block_samples, + )[0] + + if apply_cfg: + # Perform CFG + pos_out = model_out_list[:actual_batch_size] + neg_out = model_out_list[actual_batch_size:] + + noise_pred = [] + for j in range(actual_batch_size): + pos = pos_out[j].float() + neg = neg_out[j].float() + + pred = pos + current_guidance_scale * (pos - neg) + + # Renormalization + if self._cfg_normalization and float(self._cfg_normalization) > 0.0: + ori_pos_norm = torch.linalg.vector_norm(pos) + new_pos_norm = torch.linalg.vector_norm(pred) + max_new_norm = ori_pos_norm * float(self._cfg_normalization) + if new_pos_norm > max_new_norm: + pred = pred * (max_new_norm / new_pos_norm) + + noise_pred.append(pred) + + noise_pred = torch.stack(noise_pred, dim=0) + else: + noise_pred = torch.stack([t.float() for t in model_out_list], dim=0) + + noise_pred = noise_pred.squeeze(2) + noise_pred = -noise_pred + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred.to(torch.float32), t, latents, return_dict=False)[0] + assert latents.dtype == torch.float32 + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if output_type == "latent": + image = latents + + else: + latents = latents.to(self.vae.dtype) + latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor + + image = self.vae.decode(latents, 
return_dict=False)[0] + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ZImagePipelineOutput(images=image) diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 606fea18c7..eb956a96a3 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -1777,6 +1777,21 @@ class WanVACETransformer3DModel(metaclass=DummyObject): requires_backends(cls, ["torch"]) +class ZImageControlNetModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class ZImageTransformer2DModel(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index cd51d3a567..74a4146bfd 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -3857,6 +3857,36 @@ class WuerstchenPriorPipeline(metaclass=DummyObject): requires_backends(cls, ["torch", "transformers"]) +class ZImageControlNetInpaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class ZImageControlNetPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class ZImageImg2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] From b5309683cb6753e2111be9a8204f90a550c3fcb6 Mon Sep 17 00:00:00 2001 From: Miguel Martin Date: Thu, 18 Dec 2025 16:08:18 -0800 Subject: [PATCH 09/11] Cosmos Predict2.5 Base: inference pipeline, scheduler & chkpt conversion (#12852) * cosmos predict2.5 base: convert chkpt & pipeline - New scheduler: scheduling_flow_unipc_multistep.py - Changes to TransformerCosmos for text embeddings via crossattn_proj * scheduler cleanup * simplify inference pipeline * cleanup scheduler + tests * Basic tests for flow unipc * working b2b inference * Rename everything * Tests for pipeline present, but not working (predict2 also not working) * docstring update * wrapper pipelines + make style * remove unnecessary files * UniPCMultistep: support use_karras_sigmas=True and use_flow_sigmas=True * use UniPCMultistepScheduler + fix tests for pipeline * Remove FlowUniPCMultistepScheduler * UniPCMultistepScheduler for use_flow_sigmas=True & use_karras_sigmas=True * num_inference_steps=36 due to bug in scheduler used by predict2.5 * Address comments * make style + make fix-copies * fix tests + remove references to old pipelines * address comments * add revision in from_pretrained call * fix 
tests --- docs/source/en/api/pipelines/cosmos.md | 6 + scripts/convert_cosmos_to_diffusers.py | 135 ++- src/diffusers/__init__.py | 2 + .../models/transformers/transformer_cosmos.py | 13 + src/diffusers/pipelines/__init__.py | 2 + src/diffusers/pipelines/cosmos/__init__.py | 6 + .../cosmos/pipeline_cosmos2_5_predict.py | 847 ++++++++++++++++++ .../schedulers/scheduling_unipc_multistep.py | 9 +- .../dummy_torch_and_transformers_objects.py | 15 + tests/pipelines/cosmos/cosmos_guardrail.py | 11 +- .../cosmos/test_cosmos2_5_predict.py | 337 +++++++ tests/schedulers/test_scheduler_unipc.py | 29 + 12 files changed, 1398 insertions(+), 14 deletions(-) create mode 100644 src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py create mode 100644 tests/pipelines/cosmos/test_cosmos2_5_predict.py diff --git a/docs/source/en/api/pipelines/cosmos.md b/docs/source/en/api/pipelines/cosmos.md index fb9453480e..60ecce6603 100644 --- a/docs/source/en/api/pipelines/cosmos.md +++ b/docs/source/en/api/pipelines/cosmos.md @@ -70,6 +70,12 @@ output.save("output.png") - all - __call__ +## Cosmos2_5_PredictBasePipeline + +[[autodoc]] Cosmos2_5_PredictBasePipeline + - all + - __call__ + ## CosmosPipelineOutput [[autodoc]] pipelines.cosmos.pipeline_output.CosmosPipelineOutput diff --git a/scripts/convert_cosmos_to_diffusers.py b/scripts/convert_cosmos_to_diffusers.py index 6f6563ad64..6e70f8cc05 100644 --- a/scripts/convert_cosmos_to_diffusers.py +++ b/scripts/convert_cosmos_to_diffusers.py @@ -1,11 +1,55 @@ +""" +# Cosmos 2 Predict + +Download checkpoint +```bash +hf download nvidia/Cosmos-Predict2-2B-Text2Image +``` + +convert checkpoint +```bash +transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2-2B-Text2Image/snapshots/acdb5fde992a73ef0355f287977d002cbfd127e0/model.pt + +python scripts/convert_cosmos_to_diffusers.py \ + --transformer_ckpt_path $transformer_ckpt_path \ + --transformer_type Cosmos-2.0-Diffusion-2B-Text2Image \ + --text_encoder_path google-t5/t5-11b \ + --tokenizer_path google-t5/t5-11b \ + --vae_type wan2.1 \ + --output_path converted/cosmos-p2-t2i-2b \ + --save_pipeline +``` + +# Cosmos 2.5 Predict + +Download checkpoint +```bash +hf download nvidia/Cosmos-Predict2.5-2B +``` + +Convert checkpoint +```bash +transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/pre-trained/d20b7120-df3e-4911-919d-db6e08bad31c_ema_bf16.pt + +python scripts/convert_cosmos_to_diffusers.py \ + --transformer_type Cosmos-2.5-Predict-Base-2B \ + --transformer_ckpt_path $transformer_ckpt_path \ + --vae_type wan2.1 \ + --output_path converted/cosmos-p2.5-base-2b \ + --save_pipeline +``` + +""" + import argparse import pathlib +import sys from typing import Any, Dict import torch from accelerate import init_empty_weights from huggingface_hub import snapshot_download -from transformers import T5EncoderModel, T5TokenizerFast +from transformers import AutoTokenizer, Qwen2_5_VLForConditionalGeneration, T5EncoderModel, T5TokenizerFast from diffusers import ( AutoencoderKLCosmos, @@ -17,7 +61,9 @@ from diffusers import ( CosmosVideoToWorldPipeline, EDMEulerScheduler, FlowMatchEulerDiscreteScheduler, + UniPCMultistepScheduler, ) +from diffusers.pipelines.cosmos.pipeline_cosmos2_5_predict import Cosmos2_5_PredictBasePipeline def remove_keys_(key: str, state_dict: Dict[str, Any]): @@ -233,6 +279,25 @@ TRANSFORMER_CONFIGS = { "concat_padding_mask": True, "extra_pos_embed_type": None, }, + "Cosmos-2.5-Predict-Base-2B": 
{ + "in_channels": 16 + 1, + "out_channels": 16, + "num_attention_heads": 16, + "attention_head_dim": 128, + "num_layers": 28, + "mlp_ratio": 4.0, + "text_embed_dim": 1024, + "adaln_lora_dim": 256, + "max_size": (128, 240, 240), + "patch_size": (1, 2, 2), + "rope_scale": (1.0, 3.0, 3.0), + "concat_padding_mask": True, + # NOTE: source config has pos_emb_learnable: 'True' - but params are missing + "extra_pos_embed_type": None, + "use_crossattn_projection": True, + "crossattn_proj_in_channels": 100352, + "encoder_hidden_states_channels": 1024, + }, } VAE_KEYS_RENAME_DICT = { @@ -334,6 +399,9 @@ def convert_transformer(transformer_type: str, ckpt_path: str, weights_only: boo elif "Cosmos-2.0" in transformer_type: TRANSFORMER_KEYS_RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT_COSMOS_2_0 TRANSFORMER_SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP_COSMOS_2_0 + elif "Cosmos-2.5" in transformer_type: + TRANSFORMER_KEYS_RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT_COSMOS_2_0 + TRANSFORMER_SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP_COSMOS_2_0 else: assert False @@ -347,6 +415,7 @@ def convert_transformer(transformer_type: str, ckpt_path: str, weights_only: boo new_key = new_key.removeprefix(PREFIX_KEY) for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items(): new_key = new_key.replace(replace_key, rename_key) + print(key, "->", new_key, flush=True) update_state_dict_(original_state_dict, key, new_key) for key in list(original_state_dict.keys()): @@ -355,6 +424,21 @@ def convert_transformer(transformer_type: str, ckpt_path: str, weights_only: boo continue handler_fn_inplace(key, original_state_dict) + expected_keys = set(transformer.state_dict().keys()) + mapped_keys = set(original_state_dict.keys()) + missing_keys = expected_keys - mapped_keys + unexpected_keys = mapped_keys - expected_keys + if missing_keys: + print(f"ERROR: missing keys ({len(missing_keys)} from state_dict:", flush=True, file=sys.stderr) + for k in missing_keys: + print(k) + sys.exit(1) + if unexpected_keys: + print(f"ERROR: unexpected keys ({len(unexpected_keys)}) from state_dict:", flush=True, file=sys.stderr) + for k in unexpected_keys: + print(k) + sys.exit(2) + transformer.load_state_dict(original_state_dict, strict=True, assign=True) return transformer @@ -444,6 +528,34 @@ def save_pipeline_cosmos_2_0(args, transformer, vae): pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB") +def save_pipeline_cosmos2_5(args, transformer, vae): + text_encoder_path = args.text_encoder_path or "nvidia/Cosmos-Reason1-7B" + tokenizer_path = args.tokenizer_path or "Qwen/Qwen2.5-VL-7B-Instruct" + + text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained( + text_encoder_path, torch_dtype="auto", device_map="cpu" + ) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + + scheduler = UniPCMultistepScheduler( + use_karras_sigmas=True, + use_flow_sigmas=True, + prediction_type="flow_prediction", + sigma_max=200.0, + sigma_min=0.01, + ) + + pipe = Cosmos2_5_PredictBasePipeline( + text_encoder=text_encoder, + tokenizer=tokenizer, + transformer=transformer, + vae=vae, + scheduler=scheduler, + safety_checker=lambda *args, **kwargs: None, + ) + pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB") + + def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--transformer_type", type=str, default=None, choices=list(TRANSFORMER_CONFIGS.keys())) @@ -451,10 +563,10 @@ def get_args(): "--transformer_ckpt_path", type=str, default=None, 
help="Path to original transformer checkpoint" ) parser.add_argument( - "--vae_type", type=str, default=None, choices=["none", *list(VAE_CONFIGS.keys())], help="Type of VAE" + "--vae_type", type=str, default="wan2.1", choices=["wan2.1", *list(VAE_CONFIGS.keys())], help="Type of VAE" ) - parser.add_argument("--text_encoder_path", type=str, default="google-t5/t5-11b") - parser.add_argument("--tokenizer_path", type=str, default="google-t5/t5-11b") + parser.add_argument("--text_encoder_path", type=str, default=None) + parser.add_argument("--tokenizer_path", type=str, default=None) parser.add_argument("--save_pipeline", action="store_true") parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved") parser.add_argument("--dtype", default="bf16", help="Torch dtype to save the transformer in.") @@ -477,8 +589,6 @@ if __name__ == "__main__": if args.save_pipeline: assert args.transformer_ckpt_path is not None assert args.vae_type is not None - assert args.text_encoder_path is not None - assert args.tokenizer_path is not None if args.transformer_ckpt_path is not None: weights_only = "Cosmos-1.0" in args.transformer_type @@ -490,17 +600,26 @@ if __name__ == "__main__": if args.vae_type is not None: if "Cosmos-1.0" in args.transformer_type: vae = convert_vae(args.vae_type) - else: + elif "Cosmos-2.0" in args.transformer_type or "Cosmos-2.5" in args.transformer_type: vae = AutoencoderKLWan.from_pretrained( "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", subfolder="vae", torch_dtype=torch.float32 ) + else: + raise AssertionError(f"{args.transformer_type} not supported") + if not args.save_pipeline: vae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB") if args.save_pipeline: if "Cosmos-1.0" in args.transformer_type: + assert args.text_encoder_path is not None + assert args.tokenizer_path is not None save_pipeline_cosmos_1_0(args, transformer, vae) elif "Cosmos-2.0" in args.transformer_type: + assert args.text_encoder_path is not None + assert args.tokenizer_path is not None save_pipeline_cosmos_2_0(args, transformer, vae) + elif "Cosmos-2.5" in args.transformer_type: + save_pipeline_cosmos2_5(args, transformer, vae) else: - assert False + raise AssertionError(f"{args.transformer_type} not supported") diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 03ecaf6bc1..6aac3feffd 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -463,6 +463,7 @@ else: "CogView4ControlPipeline", "CogView4Pipeline", "ConsisIDPipeline", + "Cosmos2_5_PredictBasePipeline", "Cosmos2TextToImagePipeline", "Cosmos2VideoToWorldPipeline", "CosmosTextToWorldPipeline", @@ -1175,6 +1176,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: CogView4ControlPipeline, CogView4Pipeline, ConsisIDPipeline, + Cosmos2_5_PredictBasePipeline, Cosmos2TextToImagePipeline, Cosmos2VideoToWorldPipeline, CosmosTextToWorldPipeline, diff --git a/src/diffusers/models/transformers/transformer_cosmos.py b/src/diffusers/models/transformers/transformer_cosmos.py index 373b470ae3..2b0c266707 100644 --- a/src/diffusers/models/transformers/transformer_cosmos.py +++ b/src/diffusers/models/transformers/transformer_cosmos.py @@ -439,6 +439,9 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin): rope_scale: Tuple[float, float, float] = (2.0, 1.0, 1.0), concat_padding_mask: bool = True, extra_pos_embed_type: Optional[str] = "learnable", + use_crossattn_projection: bool = False, + crossattn_proj_in_channels: int = 1024, + 
encoder_hidden_states_channels: int = 1024, ) -> None: super().__init__() hidden_size = num_attention_heads * attention_head_dim @@ -485,6 +488,12 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin): hidden_size, patch_size[0] * patch_size[1] * patch_size[2] * out_channels, bias=False ) + if self.config.use_crossattn_projection: + self.crossattn_proj = nn.Sequential( + nn.Linear(crossattn_proj_in_channels, encoder_hidden_states_channels, bias=True), + nn.GELU(), + ) + self.gradient_checkpointing = False def forward( @@ -524,6 +533,7 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin): post_patch_num_frames = num_frames // p_t post_patch_height = height // p_h post_patch_width = width // p_w + hidden_states = self.patch_embed(hidden_states) hidden_states = hidden_states.flatten(1, 3) # [B, T, H, W, C] -> [B, THW, C] @@ -546,6 +556,9 @@ class CosmosTransformer3DModel(ModelMixin, ConfigMixin, FromOriginalModelMixin): else: assert False + if self.config.use_crossattn_projection: + encoder_hidden_states = self.crossattn_proj(encoder_hidden_states) + # 5. Transformer blocks for block in self.transformer_blocks: if torch.is_grad_enabled() and self.gradient_checkpointing: diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 04ec6b5cd8..e8faf868e7 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -165,6 +165,7 @@ else: _import_structure["cogview4"] = ["CogView4Pipeline", "CogView4ControlPipeline"] _import_structure["consisid"] = ["ConsisIDPipeline"] _import_structure["cosmos"] = [ + "Cosmos2_5_PredictBasePipeline", "Cosmos2TextToImagePipeline", "CosmosTextToWorldPipeline", "CosmosVideoToWorldPipeline", @@ -622,6 +623,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: StableDiffusionXLControlNetXSPipeline, ) from .cosmos import ( + Cosmos2_5_PredictBasePipeline, Cosmos2TextToImagePipeline, Cosmos2VideoToWorldPipeline, CosmosTextToWorldPipeline, diff --git a/src/diffusers/pipelines/cosmos/__init__.py b/src/diffusers/pipelines/cosmos/__init__.py index 2833c89abd..944f165531 100644 --- a/src/diffusers/pipelines/cosmos/__init__.py +++ b/src/diffusers/pipelines/cosmos/__init__.py @@ -22,6 +22,9 @@ except OptionalDependencyNotAvailable: _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) else: + _import_structure["pipeline_cosmos2_5_predict"] = [ + "Cosmos2_5_PredictBasePipeline", + ] _import_structure["pipeline_cosmos2_text2image"] = ["Cosmos2TextToImagePipeline"] _import_structure["pipeline_cosmos2_video2world"] = ["Cosmos2VideoToWorldPipeline"] _import_structure["pipeline_cosmos_text2world"] = ["CosmosTextToWorldPipeline"] @@ -35,6 +38,9 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * else: + from .pipeline_cosmos2_5_predict import ( + Cosmos2_5_PredictBasePipeline, + ) from .pipeline_cosmos2_text2image import Cosmos2TextToImagePipeline from .pipeline_cosmos2_video2world import Cosmos2VideoToWorldPipeline from .pipeline_cosmos_text2world import CosmosTextToWorldPipeline diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py new file mode 100644 index 0000000000..6564b59373 --- /dev/null +++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py @@ -0,0 +1,847 @@ +# Copyright 2025 The NVIDIA Team and The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +import torch +import torchvision +import torchvision.transforms +import torchvision.transforms.functional +from transformers import AutoTokenizer, Qwen2_5_VLForConditionalGeneration + +from ...callbacks import MultiPipelineCallbacks, PipelineCallback +from ...image_processor import PipelineImageInput +from ...models import AutoencoderKLWan, CosmosTransformer3DModel +from ...schedulers import UniPCMultistepScheduler +from ...utils import is_cosmos_guardrail_available, is_torch_xla_available, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor +from ...video_processor import VideoProcessor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import CosmosPipelineOutput + + +if is_cosmos_guardrail_available(): + from cosmos_guardrail import CosmosSafetyChecker +else: + + class CosmosSafetyChecker: + def __init__(self, *args, **kwargs): + raise ImportError( + "`cosmos_guardrail` is not installed. Please install it to use the safety checker for Cosmos: `pip install cosmos_guardrail`." + ) + + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +EXAMPLE_DOC_STRING = """ + Examples: + ```python + >>> import torch + >>> from diffusers import Cosmos2_5_PredictBasePipeline + >>> from diffusers.utils import export_to_video, load_image, load_video + + >>> model_id = "nvidia/Cosmos-Predict2.5-2B" + >>> pipe = Cosmos2_5_PredictBasePipeline.from_pretrained( + ... model_id, revision="diffusers/base/pre-trianed", torch_dtype=torch.bfloat16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> # Common negative prompt reused across modes. + >>> negative_prompt = ( + ... "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, " + ... "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, " + ... "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky " + ... "movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, " + ... 
"fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. " + ... "Overall, the video is of poor quality." + ... ) + + >>> # Text2World: generate a 93-frame world video from text only. + >>> prompt = ( + ... "As the red light shifts to green, the red bus at the intersection begins to move forward, its headlights " + ... "cutting through the falling snow. The snowy tire tracks deepen as the vehicle inches ahead, casting fresh " + ... "lines onto the slushy road. Around it, streetlights glow warmer, illuminating the drifting flakes and wet " + ... "reflections on the asphalt. Other cars behind start to edge forward, their beams joining the scene. " + ... "The stillness of the urban street transitions into motion as the quiet snowfall is punctuated by the slow " + ... "advance of traffic through the frosty city corridor." + ... ) + >>> video = pipe( + ... image=None, + ... video=None, + ... prompt=prompt, + ... negative_prompt=negative_prompt, + ... num_frames=93, + ... generator=torch.Generator().manual_seed(1), + ... ).frames[0] + >>> export_to_video(video, "text2world.mp4", fps=16) + + >>> # Image2World: condition on a single image and generate a 93-frame world video. + >>> prompt = ( + ... "A high-definition video captures the precision of robotic welding in an industrial setting. " + ... "The first frame showcases a robotic arm, equipped with a welding torch, positioned over a large metal structure. " + ... "The welding process is in full swing, with bright sparks and intense light illuminating the scene, creating a vivid " + ... "display of blue and white hues. A significant amount of smoke billows around the welding area, partially obscuring " + ... "the view but emphasizing the heat and activity. The background reveals parts of the workshop environment, including a " + ... "ventilation system and various pieces of machinery, indicating a busy and functional industrial workspace. As the video " + ... "progresses, the robotic arm maintains its steady position, continuing the welding process and moving to its left. " + ... "The welding torch consistently emits sparks and light, and the smoke continues to rise, diffusing slightly as it moves upward. " + ... "The metal surface beneath the torch shows ongoing signs of heating and melting. The scene retains its industrial ambiance, with " + ... "the welding sparks and smoke dominating the visual field, underscoring the ongoing nature of the welding operation." + ... ) + >>> image = load_image( + ... "https://media.githubusercontent.com/media/nvidia-cosmos/cosmos-predict2.5/refs/heads/main/assets/base/robot_welding.jpg" + ... ) + >>> video = pipe( + ... image=image, + ... video=None, + ... prompt=prompt, + ... negative_prompt=negative_prompt, + ... num_frames=93, + ... generator=torch.Generator().manual_seed(1), + ... ).frames[0] + >>> # export_to_video(video, "image2world.mp4", fps=16) + + >>> # Video2World: condition on an input clip and predict a 93-frame world video. + >>> prompt = ( + ... "The video opens with an aerial view of a large-scale sand mining construction operation, showcasing extensive piles " + ... "of brown sand meticulously arranged in parallel rows. A central water channel, fed by a water pipe, flows through the " + ... "middle of these sand heaps, creating ripples and movement as it cascades down. The surrounding area features dense green " + ... "vegetation on the left, contrasting with the sandy terrain, while a body of water is visible in the background on the right. " + ... 
"As the video progresses, a piece of heavy machinery, likely a bulldozer, enters the frame from the right, moving slowly along " + ... "the edge of the sand piles. This machinery's presence indicates ongoing construction work in the operation. The final frame " + ... "captures the same scene, with the water continuing its flow and the bulldozer still in motion, maintaining the dynamic yet " + ... "steady pace of the construction activity." + ... ) + >>> input_video = load_video( + ... "https://github.com/nvidia-cosmos/cosmos-predict2.5/raw/refs/heads/main/assets/base/sand_mining.mp4" + ... ) + >>> video = pipe( + ... image=None, + ... video=input_video, + ... prompt=prompt, + ... negative_prompt=negative_prompt, + ... num_frames=93, + ... generator=torch.Generator().manual_seed(1), + ... ).frames[0] + >>> export_to_video(video, "video2world.mp4", fps=16) + + >>> # To produce an image instead of a world (video) clip, set num_frames=1 and + >>> # save the first frame: pipe(..., num_frames=1).frames[0][0]. + ``` +""" + + +class Cosmos2_5_PredictBasePipeline(DiffusionPipeline): + r""" + Pipeline for [Cosmos Predict2.5](https://github.com/nvidia-cosmos/cosmos-predict2.5) base model. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + text_encoder ([`Qwen2_5_VLForConditionalGeneration`]): + Frozen text-encoder. Cosmos Predict2.5 uses the [Qwen2.5 + VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) encoder. + tokenizer (`AutoTokenizer`): + Tokenizer associated with the Qwen2.5 VL encoder. + transformer ([`CosmosTransformer3DModel`]): + Conditional Transformer to denoise the encoded image latents. + scheduler ([`UniPCMultistepScheduler`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. + vae ([`AutoencoderKLWan`]): + Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations. 
+ """ + + model_cpu_offload_seq = "text_encoder->transformer->vae" + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + # We mark safety_checker as optional here to get around some test failures, but it is not really optional + _optional_components = ["safety_checker"] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + text_encoder: Qwen2_5_VLForConditionalGeneration, + tokenizer: AutoTokenizer, + transformer: CosmosTransformer3DModel, + vae: AutoencoderKLWan, + scheduler: UniPCMultistepScheduler, + safety_checker: CosmosSafetyChecker = None, + ): + super().__init__() + + if safety_checker is None: + safety_checker = CosmosSafetyChecker() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + transformer=transformer, + scheduler=scheduler, + safety_checker=safety_checker, + ) + + self.vae_scale_factor_temporal = 2 ** sum(self.vae.temperal_downsample) if getattr(self, "vae", None) else 4 + self.vae_scale_factor_spatial = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8 + self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial) + + latents_mean = ( + torch.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1).float() + if getattr(self.vae.config, "latents_mean", None) is not None + else None + ) + latents_std = ( + torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).float() + if getattr(self.vae.config, "latents_std", None) is not None + else None + ) + self.latents_mean = latents_mean + self.latents_std = latents_std + + if self.latents_mean is None or self.latents_std is None: + raise ValueError("VAE configuration must define both `latents_mean` and `latents_std`.") + + def _get_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + max_sequence_length: int = 512, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + device = device or self._execution_device + dtype = dtype or self.text_encoder.dtype + prompt = [prompt] if isinstance(prompt, str) else prompt + + input_ids_batch = [] + + for sample_idx in range(len(prompt)): + conversations = [ + { + "role": "system", + "content": [ + { + "type": "text", + "text": "You are a helpful assistant who will provide prompts to an image generator.", + } + ], + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt[sample_idx], + } + ], + }, + ] + input_ids = self.tokenizer.apply_chat_template( + conversations, + tokenize=True, + add_generation_prompt=False, + add_vision_id=False, + max_length=max_sequence_length, + truncation=True, + padding="max_length", + ) + input_ids = torch.LongTensor(input_ids) + input_ids_batch.append(input_ids) + + input_ids_batch = torch.stack(input_ids_batch, dim=0) + + outputs = self.text_encoder( + input_ids_batch.to(device), + output_hidden_states=True, + ) + hidden_states = outputs.hidden_states + + normalized_hidden_states = [] + for layer_idx in range(1, len(hidden_states)): + normalized_state = (hidden_states[layer_idx] - hidden_states[layer_idx].mean(dim=-1, keepdim=True)) / ( + hidden_states[layer_idx].std(dim=-1, keepdim=True) + 1e-8 + ) + normalized_hidden_states.append(normalized_state) + + prompt_embeds = torch.cat(normalized_hidden_states, dim=-1) + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + return prompt_embeds + + # Modified from 
diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + do_classifier_free_guidance: bool = True, + num_videos_per_prompt: int = 1, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + max_sequence_length: int = 512, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + Whether to use classifier free guidance or not. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos that should be generated per prompt. torch device to place the resulting embeddings on + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + device: (`torch.device`, *optional*): + torch device + dtype: (`torch.dtype`, *optional*): + torch dtype + """ + device = device or self._execution_device + + prompt = [prompt] if isinstance(prompt, str) else prompt + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_embeds = self._get_prompt_embeds( + prompt=prompt, max_sequence_length=max_sequence_length, device=device, dtype=dtype + ) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) + + if do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + + negative_prompt_embeds = self._get_prompt_embeds( + prompt=negative_prompt, max_sequence_length=max_sequence_length, device=device, dtype=dtype + ) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + _, seq_len, _ = negative_prompt_embeds.shape + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) + + return prompt_embeds, negative_prompt_embeds + + # Modified from diffusers.pipelines.cosmos.pipeline_cosmos2_video2world.Cosmos2VideoToWorldPipeline.prepare_latents and + # diffusers.pipelines.cosmos.pipeline_cosmos2_video2world.Cosmos2TextToImagePipeline.prepare_latents + def prepare_latents( + self, + video: Optional[torch.Tensor], + batch_size: int, + num_channels_latents: int = 16, + height: int = 704, + width: int = 1280, + num_frames_in: int = 93, + num_frames_out: int = 93, + do_classifier_free_guidance: bool = True, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + B = batch_size + C = num_channels_latents + T = (num_frames_out - 1) // self.vae_scale_factor_temporal + 1 + H = height // self.vae_scale_factor_spatial + W = width // self.vae_scale_factor_spatial + shape = (B, C, T, H, W) + + if num_frames_in == 0: + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + cond_mask = torch.zeros((B, 1, T, H, W), dtype=latents.dtype, device=latents.device) + cond_indicator = torch.zeros((B, 1, T, 1, 1), dtype=latents.dtype, device=latents.device) + + cond_latents = torch.zeros_like(latents) + + return ( + latents, + cond_latents, + cond_mask, + cond_indicator, + ) + else: + if video is None: + raise ValueError("`video` must be provided when `num_frames_in` is greater than 0.") + needs_preprocessing = not (isinstance(video, torch.Tensor) and video.ndim == 5 and video.shape[1] == 3) + if needs_preprocessing: + video = self.video_processor.preprocess_video(video, height, width) + video = video.to(device=device, dtype=self.vae.dtype) + if isinstance(generator, list): + cond_latents = [ + retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator=generator[i]) + for i in range(batch_size) + ] + else: + cond_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video] + + cond_latents = torch.cat(cond_latents, dim=0).to(dtype) + + latents_mean = self.latents_mean.to(device=device, dtype=dtype) + latents_std = self.latents_std.to(device=device, dtype=dtype) + cond_latents = (cond_latents - latents_mean) / latents_std + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device=device, dtype=dtype) + + padding_shape = (B, 1, T, H, W) + ones_padding = latents.new_ones(padding_shape) + zeros_padding = latents.new_zeros(padding_shape) + + num_cond_latent_frames = (num_frames_in - 1) // self.vae_scale_factor_temporal + 1 + cond_indicator = latents.new_zeros(1, 1, latents.size(2), 1, 1) + 
cond_indicator[:, :, 0:num_cond_latent_frames] = 1.0 + cond_mask = cond_indicator * ones_padding + (1 - cond_indicator) * zeros_padding + + return ( + latents, + cond_latents, + cond_mask, + cond_indicator, + ) + + # Copied from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 16 != 0 or width % 16 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1.0 + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def current_timestep(self): + return self._current_timestep + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image: PipelineImageInput | None = None, + video: List[PipelineImageInput] | None = None, + prompt: Union[str, List[str]] | None = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + height: int = 704, + width: int = 1280, + num_frames: int = 93, + num_inference_steps: int = 36, + guidance_scale: float = 7.0, + num_videos_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 512, + conditional_frame_timestep: float = 0.1, + ): + r""" + The call function to the pipeline for generation. Supports three modes: + + - **Text2World**: `image=None`, `video=None`, `prompt` provided. Generates a world clip. + - **Image2World**: `image` provided, `video=None`, `prompt` provided. Conditions on a single frame. + - **Video2World**: `video` provided, `image=None`, `prompt` provided. Conditions on an input clip. + + Set `num_frames=93` (default) to produce a world video, or `num_frames=1` to produce a single image frame (the + above in "*2Image mode"). 
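+
+        Conditioning frames (from `image` or `video`) are injected as clean latents and are assigned the small fixed
+        timestep given by `conditional_frame_timestep`, while the frames to be generated follow the scheduler's sigma
+        schedule.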
+
+        Outputs follow `output_type` (e.g., `"pil"` returns a list of `num_frames` PIL images per prompt).
+
+        Args:
+            image (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, *optional*):
+                Optional single image for Image2World conditioning. Must be `None` when `video` is provided.
+            video (`List[PIL.Image.Image]`, `np.ndarray`, `torch.Tensor`, *optional*):
+                Optional input video for Video2World conditioning. Must be `None` when `image` is provided.
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide generation. Required unless `prompt_embeds` is supplied.
+            height (`int`, defaults to `704`):
+                The height in pixels of the generated image.
+            width (`int`, defaults to `1280`):
+                The width in pixels of the generated image.
+            num_frames (`int`, defaults to `93`):
+                Number of output frames. Use `93` for world (video) generation; set to `1` to return a single frame.
+            num_inference_steps (`int`, defaults to `36`):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, defaults to `7.0`):
+                Guidance scale as defined in [Classifier-Free Diffusion
+                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
+                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
+                `guidance_scale > 1`.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                The number of videos to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor is generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`CosmosPipelineOutput`] instead of a plain tuple.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. 
You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum number of tokens in the prompt. If the prompt exceeds this length, it will be truncated. If
+                the prompt is shorter than this length, it will be padded.
+
+        Examples:
+
+        Returns:
+            [`~CosmosPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`CosmosPipelineOutput`] is returned, otherwise a `tuple` is returned where
+                the first element is a list with the generated videos.
+        """
+        if self.safety_checker is None:
+            raise ValueError(
+                f"You have disabled the safety checker for {self.__class__}. This is in violation of the "
+                "[NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). "
+                f"Please ensure that you are compliant with the license agreement."
+            )
+
+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+        # Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, prompt_embeds, callback_on_step_end_tensor_inputs)
+
+        self._guidance_scale = guidance_scale
+        self._current_timestep = None
+        self._interrupt = False
+
+        device = self._execution_device
+
+        if self.safety_checker is not None:
+            self.safety_checker.to(device)
+            if prompt is not None:
+                prompt_list = [prompt] if isinstance(prompt, str) else prompt
+                for p in prompt_list:
+                    if not self.safety_checker.check_text_safety(p):
+                        raise ValueError(
+                            f"Cosmos Guardrail detected unsafe text in the prompt: {p}. Please ensure that the "
+                            f"prompt abides by the NVIDIA Open Model License Agreement." 
+ ) + + # Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Encode input prompt + ( + prompt_embeds, + negative_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + negative_prompt=negative_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + num_videos_per_prompt=num_videos_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + device=device, + max_sequence_length=max_sequence_length, + ) + + vae_dtype = self.vae.dtype + transformer_dtype = self.transformer.dtype + + num_frames_in = None + if image is not None: + if batch_size != 1: + raise ValueError(f"batch_size must be 1 for image input (given {batch_size})") + + image = torchvision.transforms.functional.to_tensor(image).unsqueeze(0) + video = torch.cat([image, torch.zeros_like(image).repeat(num_frames - 1, 1, 1, 1)], dim=0) + video = video.unsqueeze(0) + num_frames_in = 1 + elif video is None: + video = torch.zeros(batch_size, num_frames, 3, height, width, dtype=torch.uint8) + num_frames_in = 0 + else: + num_frames_in = len(video) + + if batch_size != 1: + raise ValueError(f"batch_size must be 1 for video input (given {batch_size})") + + assert video is not None + video = self.video_processor.preprocess_video(video, height, width) + + # pad with last frame (for video2world) + num_frames_out = num_frames + if video.shape[2] < num_frames_out: + n_pad_frames = num_frames_out - num_frames_in + last_frame = video[0, :, -1:, :, :] # [C, T==1, H, W] + pad_frames = last_frame.repeat(1, 1, n_pad_frames, 1, 1) # [B, C, T, H, W] + video = torch.cat((video, pad_frames), dim=2) + + assert num_frames_in <= num_frames_out, f"expected ({num_frames_in=}) <= ({num_frames_out=})" + + video = video.to(device=device, dtype=vae_dtype) + + num_channels_latents = self.transformer.config.in_channels - 1 + latents, cond_latent, cond_mask, cond_indicator = self.prepare_latents( + video=video, + batch_size=batch_size * num_videos_per_prompt, + num_channels_latents=num_channels_latents, + height=height, + width=width, + num_frames_in=num_frames_in, + num_frames_out=num_frames, + do_classifier_free_guidance=self.do_classifier_free_guidance, + dtype=torch.float32, + device=device, + generator=generator, + latents=latents, + ) + cond_timestep = torch.ones_like(cond_indicator) * conditional_frame_timestep + cond_mask = cond_mask.to(transformer_dtype) + + padding_mask = latents.new_zeros(1, 1, height, width, dtype=transformer_dtype) + + # Denoising loop + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + self._num_timesteps = len(timesteps) + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + gt_velocity = (latents - cond_latent) * cond_mask + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t.cpu().item() + + # NOTE: assumes sigma(t) \in [0, 1] + sigma_t = ( + torch.tensor(self.scheduler.sigmas[i].item()) + .unsqueeze(0) + .to(device=device, dtype=transformer_dtype) + ) + + in_latents = cond_mask * cond_latent + (1 - cond_mask) * latents + in_latents = in_latents.to(transformer_dtype) + in_timestep = cond_indicator * cond_timestep + (1 - cond_indicator) * sigma_t + noise_pred = self.transformer( + hidden_states=in_latents, + 
condition_mask=cond_mask, + timestep=in_timestep, + encoder_hidden_states=prompt_embeds, + padding_mask=padding_mask, + return_dict=False, + )[0] + # NOTE: replace velocity (noise_pred) with gt_velocity for conditioning inputs only + noise_pred = gt_velocity + noise_pred * (1 - cond_mask) + + if self.do_classifier_free_guidance: + noise_pred_neg = self.transformer( + hidden_states=in_latents, + condition_mask=cond_mask, + timestep=in_timestep, + encoder_hidden_states=negative_prompt_embeds, + padding_mask=padding_mask, + return_dict=False, + )[0] + # NOTE: replace velocity (noise_pred_neg) with gt_velocity for conditioning inputs only + noise_pred_neg = gt_velocity + noise_pred_neg * (1 - cond_mask) + noise_pred = noise_pred + self.guidance_scale * (noise_pred - noise_pred_neg) + + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if XLA_AVAILABLE: + xm.mark_step() + + self._current_timestep = None + + if not output_type == "latent": + latents_mean = self.latents_mean.to(latents.device, latents.dtype) + latents_std = self.latents_std.to(latents.device, latents.dtype) + latents = latents * latents_std + latents_mean + video = self.vae.decode(latents.to(self.vae.dtype), return_dict=False)[0] + video = self._match_num_frames(video, num_frames) + + assert self.safety_checker is not None + self.safety_checker.to(device) + video = self.video_processor.postprocess_video(video, output_type="np") + video = (video * 255).astype(np.uint8) + video_batch = [] + for vid in video: + vid = self.safety_checker.check_video_safety(vid) + video_batch.append(vid) + video = np.stack(video_batch).astype(np.float32) / 255.0 * 2 - 1 + video = torch.from_numpy(video).permute(0, 4, 1, 2, 3) + video = self.video_processor.postprocess_video(video, output_type=output_type) + else: + video = latents + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return CosmosPipelineOutput(frames=video) + + def _match_num_frames(self, video: torch.Tensor, target_num_frames: int) -> torch.Tensor: + if target_num_frames <= 0 or video.shape[2] == target_num_frames: + return video + + frames_per_latent = max(self.vae_scale_factor_temporal, 1) + video = torch.repeat_interleave(video, repeats=frames_per_latent, dim=2) + + current_frames = video.shape[2] + if current_frames < target_num_frames: + pad = video[:, :, -1:, :, :].repeat(1, 1, target_num_frames - current_frames, 1, 1) + video = torch.cat([video, pad], dim=2) + elif current_frames > target_num_frames: + video = video[:, :, :target_num_frames] + + return video diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 689c6a0635..5ea56b300b 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -217,6 +217,8 @@ class UniPCMultistepScheduler(SchedulerMixin, 
ConfigMixin): rescale_betas_zero_snr: bool = False, use_dynamic_shifting: bool = False, time_shift_type: Literal["exponential"] = "exponential", + sigma_min: Optional[float] = None, + sigma_max: Optional[float] = None, ) -> None: if self.config.use_beta_sigmas and not is_scipy_available(): raise ImportError("Make sure to install scipy if you want to use beta sigmas.") @@ -350,7 +352,12 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): log_sigmas = np.log(sigmas) sigmas = np.flip(sigmas).copy() sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) - timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + if self.config.use_flow_sigmas: + sigmas = sigmas / (sigmas + 1) + timesteps = (sigmas * self.config.num_train_timesteps).copy() + else: + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + if self.config.final_sigmas_type == "sigma_min": sigma_last = sigmas[-1] elif self.config.final_sigmas_type == "zero": diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 74a4146bfd..4e1eae211c 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -767,6 +767,21 @@ class ConsisIDPipeline(metaclass=DummyObject): requires_backends(cls, ["torch", "transformers"]) +class Cosmos2_5_PredictBasePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class Cosmos2TextToImagePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/cosmos/cosmos_guardrail.py b/tests/pipelines/cosmos/cosmos_guardrail.py index 4de14fbaaf..c9ef597fdb 100644 --- a/tests/pipelines/cosmos/cosmos_guardrail.py +++ b/tests/pipelines/cosmos/cosmos_guardrail.py @@ -27,7 +27,7 @@ class DummyCosmosSafetyChecker(ModelMixin, ConfigMixin): def __init__(self) -> None: super().__init__() - self._dtype = torch.float32 + self.register_buffer("_device_tracker", torch.zeros(1, dtype=torch.float32), persistent=False) def check_text_safety(self, prompt: str) -> bool: return True @@ -35,13 +35,14 @@ class DummyCosmosSafetyChecker(ModelMixin, ConfigMixin): def check_video_safety(self, frames: np.ndarray) -> np.ndarray: return frames - def to(self, device: Union[str, torch.device] = None, dtype: torch.dtype = None) -> None: - self._dtype = dtype + def to(self, device: Union[str, torch.device] = None, dtype: torch.dtype = None): + module = super().to(device=device, dtype=dtype) + return module @property def device(self) -> torch.device: - return None + return self._device_tracker.device @property def dtype(self) -> torch.dtype: - return self._dtype + return self._device_tracker.dtype diff --git a/tests/pipelines/cosmos/test_cosmos2_5_predict.py b/tests/pipelines/cosmos/test_cosmos2_5_predict.py new file mode 100644 index 0000000000..54d4edb485 --- /dev/null +++ b/tests/pipelines/cosmos/test_cosmos2_5_predict.py @@ -0,0 +1,337 @@ +# Copyright 2025 The HuggingFace Team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import json +import os +import tempfile +import unittest + +import numpy as np +import torch +from transformers import Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer + +from diffusers import ( + AutoencoderKLWan, + Cosmos2_5_PredictBasePipeline, + CosmosTransformer3DModel, + UniPCMultistepScheduler, +) + +from ...testing_utils import enable_full_determinism, torch_device +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineTesterMixin, to_np +from .cosmos_guardrail import DummyCosmosSafetyChecker + + +enable_full_determinism() + + +class Cosmos2_5_PredictBaseWrapper(Cosmos2_5_PredictBasePipeline): + @staticmethod + def from_pretrained(*args, **kwargs): + if "safety_checker" not in kwargs or kwargs["safety_checker"] is None: + safety_checker = DummyCosmosSafetyChecker() + device_map = kwargs.get("device_map", "cpu") + torch_dtype = kwargs.get("torch_dtype") + if device_map is not None or torch_dtype is not None: + safety_checker = safety_checker.to(device_map, dtype=torch_dtype) + kwargs["safety_checker"] = safety_checker + return Cosmos2_5_PredictBasePipeline.from_pretrained(*args, **kwargs) + + +class Cosmos2_5_PredictPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = Cosmos2_5_PredictBaseWrapper + params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"} + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS + required_optional_params = frozenset( + [ + "num_inference_steps", + "generator", + "latents", + "return_dict", + "callback_on_step_end", + "callback_on_step_end_tensor_inputs", + ] + ) + supports_dduf = False + test_xformers_attention = False + test_layerwise_casting = True + test_group_offloading = True + + def get_dummy_components(self): + torch.manual_seed(0) + transformer = CosmosTransformer3DModel( + in_channels=16 + 1, + out_channels=16, + num_attention_heads=2, + attention_head_dim=16, + num_layers=2, + mlp_ratio=2, + text_embed_dim=32, + adaln_lora_dim=4, + max_size=(4, 32, 32), + patch_size=(1, 2, 2), + rope_scale=(2.0, 1.0, 1.0), + concat_padding_mask=True, + extra_pos_embed_type="learnable", + ) + + torch.manual_seed(0) + vae = AutoencoderKLWan( + base_dim=3, + z_dim=16, + dim_mult=[1, 1, 1, 1], + num_res_blocks=1, + temperal_downsample=[False, True, True], + ) + + torch.manual_seed(0) + scheduler = UniPCMultistepScheduler() + + torch.manual_seed(0) + config = Qwen2_5_VLConfig( + text_config={ + "hidden_size": 16, + "intermediate_size": 16, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "num_key_value_heads": 2, + "rope_scaling": { + "mrope_section": [1, 1, 2], + "rope_type": "default", + "type": "default", + }, + "rope_theta": 1000000.0, + }, + vision_config={ + "depth": 2, + "hidden_size": 16, + "intermediate_size": 16, + "num_heads": 2, + "out_hidden_size": 16, + }, 
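+            # intentionally tiny hidden sizes keep the dummy text encoder fast; the vision token ids below mirror
+            # the special tokens of the Qwen2.5-VL vocabulary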
+ hidden_size=16, + vocab_size=152064, + vision_end_token_id=151653, + vision_start_token_id=151652, + vision_token_id=151654, + ) + text_encoder = Qwen2_5_VLForConditionalGeneration(config) + tokenizer = Qwen2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration") + + components = { + "transformer": transformer, + "vae": vae, + "scheduler": scheduler, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": DummyCosmosSafetyChecker(), + } + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + inputs = { + "prompt": "dance monkey", + "negative_prompt": "bad quality", + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 3.0, + "height": 32, + "width": 32, + "num_frames": 3, + "max_sequence_length": 16, + "output_type": "pt", + } + + return inputs + + def test_components_function(self): + init_components = self.get_dummy_components() + init_components = {k: v for k, v in init_components.items() if not isinstance(v, (str, int, float))} + pipe = self.pipeline_class(**init_components) + self.assertTrue(hasattr(pipe, "components")) + self.assertTrue(set(pipe.components.keys()) == set(init_components.keys())) + + def test_inference(self): + device = "cpu" + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + video = pipe(**inputs).frames + generated_video = video[0] + self.assertEqual(generated_video.shape, (3, 3, 32, 32)) + self.assertTrue(torch.isfinite(generated_video).all()) + + def test_callback_inputs(self): + sig = inspect.signature(self.pipeline_class.__call__) + has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters + has_callback_step_end = "callback_on_step_end" in sig.parameters + + if not (has_callback_tensor_inputs and has_callback_step_end): + return + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + self.assertTrue( + hasattr(pipe, "_callback_tensor_inputs"), + f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs", + ) + + def callback_inputs_subset(pipe, i, t, callback_kwargs): + for tensor_name in callback_kwargs.keys(): + assert tensor_name in pipe._callback_tensor_inputs + return callback_kwargs + + def callback_inputs_all(pipe, i, t, callback_kwargs): + for tensor_name in pipe._callback_tensor_inputs: + assert tensor_name in callback_kwargs + for tensor_name in callback_kwargs.keys(): + assert tensor_name in pipe._callback_tensor_inputs + return callback_kwargs + + inputs = self.get_dummy_inputs(torch_device) + + inputs["callback_on_step_end"] = callback_inputs_subset + inputs["callback_on_step_end_tensor_inputs"] = ["latents"] + _ = pipe(**inputs)[0] + + inputs["callback_on_step_end"] = callback_inputs_all + inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs + _ = pipe(**inputs)[0] + + def callback_inputs_change_tensor(pipe, i, t, callback_kwargs): + is_last = i == (pipe.num_timesteps - 1) + if is_last: + callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"]) + return callback_kwargs + + inputs["callback_on_step_end"] = 
callback_inputs_change_tensor + inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs + output = pipe(**inputs)[0] + assert output.abs().sum() < 1e10 + + def test_inference_batch_single_identical(self): + self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=1e-2) + + def test_attention_slicing_forward_pass( + self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3 + ): + if not getattr(self, "test_attention_slicing", True): + return + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + for component in pipe.components.values(): + if hasattr(component, "set_default_attn_processor"): + component.set_default_attn_processor() + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator_device = "cpu" + inputs = self.get_dummy_inputs(generator_device) + output_without_slicing = pipe(**inputs)[0] + + pipe.enable_attention_slicing(slice_size=1) + inputs = self.get_dummy_inputs(generator_device) + output_with_slicing1 = pipe(**inputs)[0] + + pipe.enable_attention_slicing(slice_size=2) + inputs = self.get_dummy_inputs(generator_device) + output_with_slicing2 = pipe(**inputs)[0] + + if test_max_difference: + max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max() + max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max() + self.assertLess( + max(max_diff1, max_diff2), + expected_max_diff, + "Attention slicing should not affect the inference results", + ) + + def test_save_load_optional_components(self, expected_max_difference=1e-4): + self.pipeline_class._optional_components.remove("safety_checker") + super().test_save_load_optional_components(expected_max_difference=expected_max_difference) + self.pipeline_class._optional_components.append("safety_checker") + + def test_serialization_with_variants(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + model_components = [ + component_name + for component_name, component in pipe.components.items() + if isinstance(component, torch.nn.Module) + ] + model_components.remove("safety_checker") + variant = "fp16" + + with tempfile.TemporaryDirectory() as tmpdir: + pipe.save_pretrained(tmpdir, variant=variant, safe_serialization=False) + + with open(f"{tmpdir}/model_index.json", "r") as f: + config = json.load(f) + + for subfolder in os.listdir(tmpdir): + if not os.path.isfile(subfolder) and subfolder in model_components: + folder_path = os.path.join(tmpdir, subfolder) + is_folder = os.path.isdir(folder_path) and subfolder in config + assert is_folder and any(p.split(".")[1].startswith(variant) for p in os.listdir(folder_path)) + + def test_torch_dtype_dict(self): + components = self.get_dummy_components() + if not components: + self.skipTest("No dummy components defined.") + + pipe = self.pipeline_class(**components) + + specified_key = next(iter(components.keys())) + + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdirname: + pipe.save_pretrained(tmpdirname, safe_serialization=False) + torch_dtype_dict = {specified_key: torch.bfloat16, "default": torch.float16} + loaded_pipe = self.pipeline_class.from_pretrained( + tmpdirname, safety_checker=DummyCosmosSafetyChecker(), torch_dtype=torch_dtype_dict + ) + + for name, component in loaded_pipe.components.items(): + if name == "safety_checker": + continue + if isinstance(component, torch.nn.Module) and hasattr(component, "dtype"): + expected_dtype = 
torch_dtype_dict.get(name, torch_dtype_dict.get("default", torch.float32)) + self.assertEqual( + component.dtype, + expected_dtype, + f"Component '{name}' has dtype {component.dtype} but expected {expected_dtype}", + ) + + @unittest.skip( + "The pipeline should not be runnable without a safety checker. The test creates a pipeline without passing in " + "a safety checker, which makes the pipeline default to the actual Cosmos Guardrail. The Cosmos Guardrail is " + "too large and slow to run on CI." + ) + def test_encode_prompt_works_in_isolation(self): + pass diff --git a/tests/schedulers/test_scheduler_unipc.py b/tests/schedulers/test_scheduler_unipc.py index 197c831cb0..ac7e1d3f88 100644 --- a/tests/schedulers/test_scheduler_unipc.py +++ b/tests/schedulers/test_scheduler_unipc.py @@ -399,3 +399,32 @@ class UniPCMultistepScheduler1DTest(UniPCMultistepSchedulerTest): def test_exponential_sigmas(self): self.check_over_configs(use_exponential_sigmas=True) + + def test_flow_and_karras_sigmas(self): + self.check_over_configs(use_flow_sigmas=True, use_karras_sigmas=True) + + def test_flow_and_karras_sigmas_values(self): + num_train_timesteps = 1000 + num_inference_steps = 5 + scheduler = UniPCMultistepScheduler( + sigma_min=0.01, + sigma_max=200.0, + use_flow_sigmas=True, + use_karras_sigmas=True, + num_train_timesteps=num_train_timesteps, + ) + scheduler.set_timesteps(num_inference_steps=num_inference_steps) + + expected_sigmas = [ + 0.9950248599052429, + 0.9787454605102539, + 0.8774884343147278, + 0.3604971766471863, + 0.009900986216962337, + 0.0, # 0 appended as default + ] + expected_sigmas = torch.tensor(expected_sigmas) + expected_timesteps = (expected_sigmas * num_train_timesteps).to(torch.int64) + expected_timesteps = expected_timesteps[0:-1] + self.assertTrue(torch.allclose(scheduler.sigmas, expected_sigmas)) + self.assertTrue(torch.all(expected_timesteps == scheduler.timesteps)) From f7753b1bc8b4b3b97dc7f71d51ccb3a281b17b48 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 18 Dec 2025 19:25:20 -1000 Subject: [PATCH 10/11] more update in modular (#12560) * move node registry to mellon * up * fix * modula rpipeline update: filter out none for input_names, fix default blocks for pipe.init() and allow user pass additional kwargs_type in a dict * qwen modular refactor, unpack before decode * update mellon node config, adding* to required_inputs and required_model_inputs * modularpipeline.from_pretrained: error out if no config found * add a component_names property to modular blocks to be consistent! 
* flux image_encoder -> vae_encoder * controlnet_bundle * refator MellonNodeConfig MellonPipelineConfig * refactor & simplify mellon utils * vae_image_encoder -> vae_encoder * mellon config save keep key order * style + copies * add kwargs input for zimage --- .../modular_pipelines/flux/modular_blocks.py | 4 +- .../modular_pipelines/mellon_node_utils.py | 1141 +++++++---------- .../modular_pipelines/modular_pipeline.py | 25 +- .../modular_pipelines/qwenimage/decoders.py | 54 +- .../qwenimage/modular_blocks.py | 29 +- .../modular_pipelines/qwenimage/node_utils.py | 95 -- .../stable_diffusion_xl/node_utils.py | 99 -- .../modular_pipelines/z_image/denoise.py | 4 + .../z_image/modular_blocks.py | 8 +- 9 files changed, 585 insertions(+), 874 deletions(-) delete mode 100644 src/diffusers/modular_pipelines/qwenimage/node_utils.py delete mode 100644 src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py diff --git a/src/diffusers/modular_pipelines/flux/modular_blocks.py b/src/diffusers/modular_pipelines/flux/modular_blocks.py index a80bc2a5f7..bd9b2d1b40 100644 --- a/src/diffusers/modular_pipelines/flux/modular_blocks.py +++ b/src/diffusers/modular_pipelines/flux/modular_blocks.py @@ -360,7 +360,7 @@ class FluxKontextCoreDenoiseStep(SequentialPipelineBlocks): AUTO_BLOCKS = InsertableDict( [ ("text_encoder", FluxTextEncoderStep()), - ("image_encoder", FluxAutoVaeEncoderStep()), + ("vae_encoder", FluxAutoVaeEncoderStep()), ("denoise", FluxCoreDenoiseStep()), ("decode", FluxDecodeStep()), ] @@ -369,7 +369,7 @@ AUTO_BLOCKS = InsertableDict( AUTO_BLOCKS_KONTEXT = InsertableDict( [ ("text_encoder", FluxTextEncoderStep()), - ("image_encoder", FluxKontextAutoVaeEncoderStep()), + ("vae_encoder", FluxKontextAutoVaeEncoderStep()), ("denoise", FluxKontextCoreDenoiseStep()), ("decode", FluxDecodeStep()), ] diff --git a/src/diffusers/modular_pipelines/mellon_node_utils.py b/src/diffusers/modular_pipelines/mellon_node_utils.py index a405aebee2..4f142a453f 100644 --- a/src/diffusers/modular_pipelines/mellon_node_utils.py +++ b/src/diffusers/modular_pipelines/mellon_node_utils.py @@ -4,315 +4,31 @@ import os # Simple typed wrapper for parameter overrides from dataclasses import asdict, dataclass -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, Optional, Union -from huggingface_hub import create_repo, hf_hub_download +from huggingface_hub import create_repo, hf_hub_download, upload_folder from huggingface_hub.utils import ( EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError, - validate_hf_hub_args, ) -from ..utils import HUGGINGFACE_CO_RESOLVE_ENDPOINT, PushToHubMixin, extract_commit_hash -from .modular_pipeline import ModularPipelineBlocks +from ..utils import HUGGINGFACE_CO_RESOLVE_ENDPOINT logger = logging.getLogger(__name__) -SUPPORTED_NODE_TYPES = {"controlnet", "vae_encoder", "denoise", "text_encoder", "decoder"} - - -# Mellon Input Parameters (runtime parameters, not models) -MELLON_INPUT_PARAMS = { - # controlnet - "control_image": { - "label": "Control Image", - "type": "image", - "display": "input", - }, - "controlnet_conditioning_scale": { - "label": "Scale", - "type": "float", - "default": 0.5, - "min": 0, - "max": 1, - }, - "control_guidance_end": { - "label": "End", - "type": "float", - "default": 1.0, - "min": 0, - "max": 1, - }, - "control_guidance_start": { - "label": "Start", - "type": "float", - "default": 0.0, - "min": 0, - "max": 1, - }, - "controlnet": { - "label": "Controlnet", - "type": "custom_controlnet", 
- "display": "input", - }, - "embeddings": { - "label": "Text Embeddings", - "display": "input", - "type": "embeddings", - }, - "image": { - "label": "Image", - "type": "image", - "display": "input", - }, - "negative_prompt": { - "label": "Negative Prompt", - "type": "string", - "default": "", - "display": "textarea", - }, - "prompt": { - "label": "Prompt", - "type": "string", - "default": "", - "display": "textarea", - }, - "guidance_scale": { - "label": "Guidance Scale", - "type": "float", - "display": "slider", - "default": 5, - "min": 1.0, - "max": 30.0, - "step": 0.1, - }, - "height": { - "label": "Height", - "type": "int", - "default": 1024, - "min": 64, - "step": 8, - }, - "image_latents": { - "label": "Image Latents", - "type": "latents", - "display": "input", - "onChange": {False: ["height", "width"], True: ["strength"]}, - }, - "latents": { - "label": "Latents", - "type": "latents", - "display": "input", - }, - "num_inference_steps": { - "label": "Steps", - "type": "int", - "display": "slider", - "default": 25, - "min": 1, - "max": 100, - }, - "seed": { - "label": "Seed", - "type": "int", - "display": "random", - "default": 0, - "min": 0, - "max": 4294967295, - }, - "strength": { - "label": "Strength", - "type": "float", - "default": 0.5, - "min": 0.0, - "max": 1.0, - "step": 0.01, - }, - "width": { - "label": "Width", - "type": "int", - "default": 1024, - "min": 64, - "step": 8, - }, - "ip_adapter": { - "label": "IP Adapter", - "type": "custom_ip_adapter", - "display": "input", - }, -} - -# Mellon Model Parameters (diffusers_auto_model types) -MELLON_MODEL_PARAMS = { - "scheduler": { - "label": "Scheduler", - "display": "input", - "type": "diffusers_auto_model", - }, - "text_encoders": { - "label": "Text Encoders", - "type": "diffusers_auto_models", - "display": "input", - }, - "unet": { - "label": "Unet", - "display": "input", - "type": "diffusers_auto_model", - "onSignal": { - "action": "signal", - "target": "guider", - }, - }, - "guider": { - "label": "Guider", - "display": "input", - "type": "custom_guider", - "onChange": {False: ["guidance_scale"], True: []}, - }, - "vae": { - "label": "VAE", - "display": "input", - "type": "diffusers_auto_model", - }, - "controlnet": { - "label": "Controlnet Model", - "type": "diffusers_auto_model", - "display": "input", - }, -} - -# Mellon Output Parameters (display = "output") -MELLON_OUTPUT_PARAMS = { - "embeddings": { - "label": "Text Embeddings", - "display": "output", - "type": "embeddings", - }, - "images": { - "label": "Images", - "type": "image", - "display": "output", - }, - "image_latents": { - "label": "Image Latents", - "type": "latents", - "display": "output", - }, - "latents": { - "label": "Latents", - "type": "latents", - "display": "output", - }, - "latents_preview": { - "label": "Latents Preview", - "display": "output", - "type": "latent", - }, - "controlnet_out": { - "label": "Controlnet", - "display": "output", - "type": "controlnet", - }, -} - - -# Default param selections per supported node_type -# from MELLON_INPUT_PARAMS / MELLON_MODEL_PARAMS / MELLON_OUTPUT_PARAMS. 
-NODE_TYPE_PARAMS_MAP = { - "controlnet": { - "inputs": [ - "control_image", - "controlnet_conditioning_scale", - "control_guidance_start", - "control_guidance_end", - "height", - "width", - ], - "model_inputs": [ - "controlnet", - "vae", - ], - "outputs": [ - "controlnet", - ], - "block_names": ["controlnet_vae_encoder"], - }, - "denoise": { - "inputs": [ - "embeddings", - "width", - "height", - "seed", - "num_inference_steps", - "guidance_scale", - "image_latents", - "strength", - # custom adapters coming in as inputs - "controlnet", - # ip_adapter is optional and custom; include if available - "ip_adapter", - ], - "model_inputs": [ - "unet", - "guider", - "scheduler", - ], - "outputs": [ - "latents", - "latents_preview", - ], - "block_names": ["denoise"], - }, - "vae_encoder": { - "inputs": [ - "image", - "width", - "height", - ], - "model_inputs": [ - "vae", - ], - "outputs": [ - "image_latents", - ], - "block_names": ["vae_encoder"], - }, - "text_encoder": { - "inputs": [ - "prompt", - "negative_prompt", - # optional image prompt input supported in embeddings node - "image", - ], - "model_inputs": [ - "text_encoders", - ], - "outputs": [ - "embeddings", - ], - "block_names": ["text_encoder"], - }, - "decoder": { - "inputs": [ - "latents", - ], - "model_inputs": [ - "vae", - ], - "outputs": [ - "images", - ], - "block_names": ["decode"], - }, -} - - @dataclass(frozen=True) class MellonParam: + """ + Parameter definition for Mellon nodes. + + Use factory methods for common params (e.g., MellonParam.seed()) or create custom ones with MellonParam(name="...", + label="...", type="..."). + """ + name: str label: str type: str @@ -326,122 +42,482 @@ class MellonParam: fieldOptions: Optional[Dict[str, Any]] = None onChange: Any = None onSignal: Any = None - _map_to_input: Any = None # the block input name this parameter maps to def to_dict(self) -> Dict[str, Any]: + """Convert to dict for Mellon schema, excluding None values and name.""" data = asdict(self) - return {k: v for k, v in data.items() if not k.startswith("_") and v is not None} - - -@dataclass -class MellonNodeConfig(PushToHubMixin): - """ - A MellonNodeConfig is a base class to build Mellon nodes UI with modular diffusers. - - - - This is an experimental feature and is likely to change in the future. 
- - - """ - - inputs: List[Union[str, MellonParam]] - model_inputs: List[Union[str, MellonParam]] - outputs: List[Union[str, MellonParam]] - blocks_names: list[str] - node_type: str - config_name = "mellon_config.json" - - def __post_init__(self): - if isinstance(self.inputs, list): - self.inputs = self._resolve_params_list(self.inputs, MELLON_INPUT_PARAMS) - if isinstance(self.model_inputs, list): - self.model_inputs = self._resolve_params_list(self.model_inputs, MELLON_MODEL_PARAMS) - if isinstance(self.outputs, list): - self.outputs = self._resolve_params_list(self.outputs, MELLON_OUTPUT_PARAMS) - - @staticmethod - def _resolve_params_list( - params: List[Union[str, MellonParam]], default_map: Dict[str, Dict[str, Any]] - ) -> Dict[str, Dict[str, Any]]: - def _resolve_param( - param: Union[str, MellonParam], default_params_map: Dict[str, Dict[str, Any]] - ) -> Tuple[str, Dict[str, Any]]: - if isinstance(param, str): - if param not in default_params_map: - raise ValueError(f"Unknown param '{param}', please define a `MellonParam` object instead") - return param, default_params_map[param].copy() - elif isinstance(param, MellonParam): - param_dict = param.to_dict() - param_name = param_dict.pop("name") - return param_name, param_dict - else: - raise ValueError( - f"Unknown param type '{type(param)}', please use a string or a `MellonParam` object instead" - ) - - resolved = {} - for p in params: - logger.info(f" Resolving param: {p}") - name, cfg = _resolve_param(p, default_map) - if name in resolved: - raise ValueError(f"Duplicate param '{name}'") - resolved[name] = cfg - return resolved + return {k: v for k, v in data.items() if v is not None and k != "name"} @classmethod - @validate_hf_hub_args - def load_mellon_config( + def image(cls) -> "MellonParam": + return cls(name="image", label="Image", type="image", display="input") + + @classmethod + def images(cls) -> "MellonParam": + return cls(name="images", label="Images", type="image", display="output") + + @classmethod + def control_image(cls, display: str = "input") -> "MellonParam": + return cls(name="control_image", label="Control Image", type="image", display=display) + + @classmethod + def latents(cls, display: str = "input") -> "MellonParam": + return cls(name="latents", label="Latents", type="latents", display=display) + + @classmethod + def image_latents(cls, display: str = "input") -> "MellonParam": + return cls(name="image_latents", label="Image Latents", type="latents", display=display) + + @classmethod + def image_latents_with_strength(cls) -> "MellonParam": + return cls( + name="image_latents", + label="Image Latents", + type="latents", + display="input", + onChange={"false": ["height", "width"], "true": ["strength"]}, + ) + + @classmethod + def latents_preview(cls) -> "MellonParam": + """ + `Latents Preview` is a special output parameter that is used to preview the latents in the UI. 
+ """ + return cls(name="latents_preview", label="Latents Preview", type="latent", display="output") + + @classmethod + def embeddings(cls, display: str = "output") -> "MellonParam": + return cls(name="embeddings", label="Text Embeddings", type="embeddings", display=display) + + @classmethod + def controlnet_conditioning_scale(cls, default: float = 0.5) -> "MellonParam": + return cls( + name="controlnet_conditioning_scale", + label="Controlnet Conditioning Scale", + type="float", + default=default, + min=0.0, + max=1.0, + step=0.01, + ) + + @classmethod + def control_guidance_start(cls, default: float = 0.0) -> "MellonParam": + return cls( + name="control_guidance_start", + label="Control Guidance Start", + type="float", + default=default, + min=0.0, + max=1.0, + step=0.01, + ) + + @classmethod + def control_guidance_end(cls, default: float = 1.0) -> "MellonParam": + return cls( + name="control_guidance_end", + label="Control Guidance End", + type="float", + default=default, + min=0.0, + max=1.0, + step=0.01, + ) + + @classmethod + def prompt(cls, default: str = "") -> "MellonParam": + return cls(name="prompt", label="Prompt", type="string", default=default, display="textarea") + + @classmethod + def negative_prompt(cls, default: str = "") -> "MellonParam": + return cls(name="negative_prompt", label="Negative Prompt", type="string", default=default, display="textarea") + + @classmethod + def strength(cls, default: float = 0.5) -> "MellonParam": + return cls(name="strength", label="Strength", type="float", default=default, min=0.0, max=1.0, step=0.01) + + @classmethod + def guidance_scale(cls, default: float = 5.0) -> "MellonParam": + return cls( + name="guidance_scale", + label="Guidance Scale", + type="float", + display="slider", + default=default, + min=1.0, + max=30.0, + step=0.1, + ) + + @classmethod + def height(cls, default: int = 1024) -> "MellonParam": + return cls(name="height", label="Height", type="int", default=default, min=64, step=8) + + @classmethod + def width(cls, default: int = 1024) -> "MellonParam": + return cls(name="width", label="Width", type="int", default=default, min=64, step=8) + + @classmethod + def seed(cls, default: int = 0) -> "MellonParam": + return cls(name="seed", label="Seed", type="int", default=default, min=0, max=4294967295, display="random") + + @classmethod + def num_inference_steps(cls, default: int = 25) -> "MellonParam": + return cls( + name="num_inference_steps", label="Steps", type="int", default=default, min=1, max=100, display="slider" + ) + + @classmethod + def vae(cls) -> "MellonParam": + """ + VAE model info dict. + + Contains keys like 'model_id', 'repo_id', 'execution_device' etc. Use components.get_one(model_id) to retrieve + the actual model. + """ + return cls(name="vae", label="VAE", type="diffusers_auto_model", display="input") + + @classmethod + def unet(cls) -> "MellonParam": + """ + Denoising model (UNet/Transformer) info dict. + + Contains keys like 'model_id', 'repo_id', 'execution_device' etc. Use components.get_one(model_id) to retrieve + the actual model. + """ + return cls(name="unet", label="Denoise Model", type="diffusers_auto_model", display="input") + + @classmethod + def scheduler(cls) -> "MellonParam": + """ + Scheduler model info dict. + + Contains keys like 'model_id', 'repo_id' etc. Use components.get_one(model_id) to retrieve the actual + scheduler. 
+ """ + return cls(name="scheduler", label="Scheduler", type="diffusers_auto_model", display="input") + + @classmethod + def controlnet(cls) -> "MellonParam": + """ + ControlNet model info dict. + + Contains keys like 'model_id', 'repo_id', 'execution_device' etc. Use components.get_one(model_id) to retrieve + the actual model. + """ + return cls(name="controlnet", label="ControlNet Model", type="diffusers_auto_model", display="input") + + @classmethod + def text_encoders(cls) -> "MellonParam": + """ + Dict of text encoder model info dicts. + + Structure: { + 'text_encoder': {'model_id': ..., 'execution_device': ..., ...}, 'tokenizer': {'model_id': ..., ...}, + 'repo_id': '...' + } Use components.get_one(model_id) to retrieve each model. + """ + return cls(name="text_encoders", label="Text Encoders", type="diffusers_auto_models", display="input") + + @classmethod + def controlnet_bundle(cls, display: str = "input") -> "MellonParam": + """ + ControlNet bundle containing model info and processed control inputs. + + Structure: { + 'controlnet': {'model_id': ..., ...}, # controlnet model info dict 'control_image': ..., # processed + control image/embeddings 'controlnet_conditioning_scale': ..., ... # other inputs expected by denoise + blocks + } + + Output from Controlnet node, input to Denoise node. + """ + return cls(name="controlnet_bundle", label="ControlNet", type="custom_controlnet", display=display) + + @classmethod + def ip_adapter(cls) -> "MellonParam": + return cls(name="ip_adapter", label="IP Adapter", type="custom_ip_adapter", display="input") + + @classmethod + def guider(cls) -> "MellonParam": + return cls( + name="guider", + label="Guider", + type="custom_guider", + display="input", + onChange={False: ["guidance_scale"], True: []}, + ) + + @classmethod + def doc(cls) -> "MellonParam": + return cls(name="doc", label="Doc", type="string", display="output") + + +def mark_required(label: str, marker: str = " *") -> str: + """Add required marker to label if not already present.""" + if label.endswith(marker): + return label + return f"{label}{marker}" + + +def node_spec_to_mellon_dict(node_spec: Dict[str, Any], node_type: str) -> Dict[str, Any]: + """ + Convert a node spec dict into Mellon format. + + A node spec is how we define a Mellon diffusers node in code. This function converts it into the `params` map + format that Mellon UI expects. + + The `params` map is a dict where keys are parameter names and values are UI configuration: + ```python + {"seed": {"label": "Seed", "type": "int", "default": 0}} + ``` + + For Modular Mellon nodes, we need to distinguish: + - `inputs`: Pipeline inputs (e.g., seed, prompt, image) + - `model_inputs`: Model components (e.g., unet, vae, scheduler) + - `outputs`: Node outputs (e.g., latents, images) + + The node spec also includes: + - `required_inputs` / `required_model_inputs`: Which params are required (marked with *) + - `block_name`: The modular pipeline block this node corresponds to on backend + + We provide factory methods for common parameters (e.g., `MellonParam.seed()`, `MellonParam.unet()`) so you don't + have to manually specify all the UI configuration. + + Args: + node_spec: Dict with `inputs`, `model_inputs`, `outputs` (lists of MellonParam), + plus `required_inputs`, `required_model_inputs`, `block_name`. 
+ node_type: The node type string (e.g., "denoise", "controlnet") + + Returns: + Dict with: + - `params`: Flat dict of all params in Mellon UI format + - `input_names`: List of input parameter names + - `model_input_names`: List of model input parameter names + - `output_names`: List of output parameter names + - `block_name`: The backend block name + - `node_type`: The node type + + Example: + ```python + node_spec = { + "inputs": [MellonParam.seed(), MellonParam.prompt()], + "model_inputs": [MellonParam.unet()], + "outputs": [MellonParam.latents(display="output")], + "required_inputs": ["prompt"], + "required_model_inputs": ["unet"], + "block_name": "denoise", + } + + result = node_spec_to_mellon_dict(node_spec, "denoise") + # Returns: + # { + # "params": { + # "seed": {"label": "Seed", "type": "int", "default": 0}, + # "prompt": {"label": "Prompt *", "type": "string", "default": ""}, # * marks required + # "unet": {"label": "Denoise Model *", "type": "diffusers_auto_model", "display": "input"}, + # "latents": {"label": "Latents", "type": "latents", "display": "output"}, + # }, + # "input_names": ["seed", "prompt"], + # "model_input_names": ["unet"], + # "output_names": ["latents"], + # "block_name": "denoise", + # "node_type": "denoise", + # } + ``` + """ + params = {} + input_names = [] + model_input_names = [] + output_names = [] + + required_inputs = node_spec.get("required_inputs", []) + required_model_inputs = node_spec.get("required_model_inputs", []) + + # Process inputs + for p in node_spec.get("inputs", []): + param_dict = p.to_dict() + if p.name in required_inputs: + param_dict["label"] = mark_required(param_dict["label"]) + params[p.name] = param_dict + input_names.append(p.name) + + # Process model_inputs + for p in node_spec.get("model_inputs", []): + param_dict = p.to_dict() + if p.name in required_model_inputs: + param_dict["label"] = mark_required(param_dict["label"]) + params[p.name] = param_dict + model_input_names.append(p.name) + + # Process outputs + for p in node_spec.get("outputs", []): + params[p.name] = p.to_dict() + output_names.append(p.name) + + return { + "params": params, + "input_names": input_names, + "model_input_names": model_input_names, + "output_names": output_names, + "block_name": node_spec.get("block_name"), + "node_type": node_type, + } + + +class MellonPipelineConfig: + """ + Configuration for an entire Mellon pipeline containing multiple nodes. + + Accepts node specs as dicts with inputs/model_inputs/outputs lists of MellonParam, converts them to Mellon-ready + format, and handles save/load to Hub. 
+ + Example: + ```python + config = MellonPipelineConfig( + node_specs={ + "denoise": { + "inputs": [MellonParam.seed(), MellonParam.prompt()], + "model_inputs": [MellonParam.unet()], + "outputs": [MellonParam.latents(display="output")], + "required_inputs": ["prompt"], + "required_model_inputs": ["unet"], + "block_name": "denoise", + }, + "decoder": { + "inputs": [MellonParam.latents(display="input")], + "outputs": [MellonParam.images()], + "block_name": "decoder", + }, + }, + label="My Pipeline", + default_repo="user/my-pipeline", + default_dtype="float16", + ) + + # Access Mellon format dict + denoise = config.node_params["denoise"] + input_names = denoise["input_names"] + params = denoise["params"] + + # Save to Hub + config.save("./my_config", push_to_hub=True, repo_id="user/my-pipeline") + + # Load from Hub + loaded = MellonPipelineConfig.load("user/my-pipeline") + ``` + """ + + config_name = "mellon_pipeline_config.json" + + def __init__( + self, + node_specs: Dict[str, Optional[Dict[str, Any]]], + label: str = "", + default_repo: str = "", + default_dtype: str = "", + ): + """ + Args: + node_specs: Dict mapping node_type to node spec or None. + Node spec has: inputs, model_inputs, outputs, required_inputs, required_model_inputs, + block_name (all optional) + label: Human-readable label for the pipeline + default_repo: Default HuggingFace repo for this pipeline + default_dtype: Default dtype (e.g., "float16", "bfloat16") + """ + # Convert all node specs to Mellon format immediately + self.node_params = {} + for node_type, spec in node_specs.items(): + if spec is None: + self.node_params[node_type] = None + else: + self.node_params[node_type] = node_spec_to_mellon_dict(spec, node_type) + + self.label = label + self.default_repo = default_repo + self.default_dtype = default_dtype + + def __repr__(self) -> str: + node_types = list(self.node_params.keys()) + return f"MellonPipelineConfig(label={self.label!r}, default_repo={self.default_repo!r}, default_dtype={self.default_dtype!r}, node_params={node_types})" + + def to_dict(self) -> Dict[str, Any]: + """Convert to a JSON-serializable dictionary.""" + return { + "label": self.label, + "default_repo": self.default_repo, + "default_dtype": self.default_dtype, + "node_params": self.node_params, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "MellonPipelineConfig": + """ + Create from a dictionary (loaded from JSON). + + Note: The mellon_params are already in Mellon format when loading from JSON. 
+ """ + instance = cls.__new__(cls) + instance.node_params = data.get("node_params", {}) + instance.label = data.get("label", "") + instance.default_repo = data.get("default_repo", "") + instance.default_dtype = data.get("default_dtype", "") + return instance + + def to_json_string(self) -> str: + """Serialize to JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=False) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """Save to a JSON file.""" + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string()) + + @classmethod + def from_json_file(cls, json_file_path: Union[str, os.PathLike]) -> "MellonPipelineConfig": + """Load from a JSON file.""" + with open(json_file_path, "r", encoding="utf-8") as reader: + data = json.load(reader) + return cls.from_dict(data) + + def save(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + """Save the pipeline config to a directory.""" + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + output_path = os.path.join(save_directory, self.config_name) + self.to_json_file(output_path) + logger.info(f"Pipeline config saved to {output_path}") + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + private = kwargs.pop("private", None) + create_pr = kwargs.pop("create_pr", False) + token = kwargs.pop("token", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id + subfolder = kwargs.pop("subfolder", None) + + upload_folder( + repo_id=repo_id, + folder_path=save_directory, + token=token, + commit_message=commit_message or "Upload MellonPipelineConfig", + create_pr=create_pr, + path_in_repo=subfolder, + ) + logger.info(f"Pipeline config pushed to hub: {repo_id}") + + @classmethod + def load( cls, pretrained_model_name_or_path: Union[str, os.PathLike], - return_unused_kwargs=False, - return_commit_hash=False, **kwargs, - ) -> Tuple[Dict[str, Any], Dict[str, Any]]: - r""" - Load a model or scheduler configuration. - - Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing model weights saved with - [`~ConfigMixin.save_config`]. - - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - output_loading_info(`bool`, *optional*, defaults to `False`): - Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. - local_files_only (`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. 
If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_unused_kwargs (`bool`, *optional*, defaults to `False): - Whether unused keyword arguments of the config are returned. - return_commit_hash (`bool`, *optional*, defaults to `False): - Whether the `commit_hash` of the loaded configuration are returned. - - Returns: - `dict`: - A dictionary of all the parameters stored in a JSON configuration file. - - """ + ) -> "MellonPipelineConfig": + """Load a pipeline config from a local path or Hugging Face Hub.""" cache_dir = kwargs.pop("cache_dir", None) local_dir = kwargs.pop("local_dir", None) local_dir_use_symlinks = kwargs.pop("local_dir_use_symlinks", "auto") @@ -450,27 +526,18 @@ class MellonNodeConfig(PushToHubMixin): token = kwargs.pop("token", None) local_files_only = kwargs.pop("local_files_only", False) revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) pretrained_model_name_or_path = str(pretrained_model_name_or_path) - if cls.config_name is None: - raise ValueError( - "`self.config_name` is not defined. Note that one should not load a config from " - "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`" - ) if os.path.isfile(pretrained_model_name_or_path): config_file = pretrained_model_name_or_path elif os.path.isdir(pretrained_model_name_or_path): - if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)): - # Load from a PyTorch checkpoint - config_file = os.path.join(pretrained_model_name_or_path, cls.config_name) - else: - raise EnvironmentError( - f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}." 
- ) + config_file = os.path.join(pretrained_model_name_or_path, cls.config_name) + if not os.path.isfile(config_file): + raise EnvironmentError(f"No file named {cls.config_name} found in {pretrained_model_name_or_path}") else: try: - # Load from URL or cache if already cached config_file = hf_hub_download( pretrained_model_name_or_path, filename=cls.config_name, @@ -480,6 +547,7 @@ class MellonNodeConfig(PushToHubMixin): local_files_only=local_files_only, token=token, revision=revision, + subfolder=subfolder, local_dir=local_dir, local_dir_use_symlinks=local_dir_use_symlinks, ) @@ -519,245 +587,8 @@ class MellonNodeConfig(PushToHubMixin): f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " f"containing a {cls.config_name} file" ) + try: - with open(config_file, "r", encoding="utf-8") as reader: - text = reader.read() - config_dict = json.loads(text) - - commit_hash = extract_commit_hash(config_file) + return cls.from_json_file(config_file) except (json.JSONDecodeError, UnicodeDecodeError): - raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.") - - if not (return_unused_kwargs or return_commit_hash): - return config_dict - - outputs = (config_dict,) - - if return_unused_kwargs: - outputs += (kwargs,) - - if return_commit_hash: - outputs += (commit_hash,) - - return outputs - - def save_mellon_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): - """ - Save the Mellon node definition to a JSON file. - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the configuration JSON file is saved (will be created if it does not exist). - push_to_hub (`bool`, *optional*, defaults to `False`): - Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the - repository you want to push to with `repo_id` (will default to the name of `save_directory` in your - namespace). - kwargs (`Dict[str, Any]`, *optional*): - Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. - """ - if os.path.isfile(save_directory): - raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") - - os.makedirs(save_directory, exist_ok=True) - - # If we save using the predefined names, we can load using `from_config` - output_config_file = os.path.join(save_directory, self.config_name) - - self.to_json_file(output_config_file) - logger.info(f"Mellon node definition saved in {output_config_file}") - - if push_to_hub: - commit_message = kwargs.pop("commit_message", None) - private = kwargs.pop("private", None) - create_pr = kwargs.pop("create_pr", False) - token = kwargs.pop("token", None) - repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) - repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id - subfolder = kwargs.pop("subfolder", None) - - self._upload_folder( - save_directory, - repo_id, - token=token, - commit_message=commit_message, - create_pr=create_pr, - subfolder=subfolder, - ) - - def to_json_file(self, json_file_path: Union[str, os.PathLike]): - """ - Save the Mellon schema dictionary to a JSON file. - - Args: - json_file_path (`str` or `os.PathLike`): - Path to the JSON file to save a configuration instance's parameters. 
- """ - with open(json_file_path, "w", encoding="utf-8") as writer: - writer.write(self.to_json_string()) - - def to_json_string(self) -> str: - """ - Serializes this instance to a JSON string of the Mellon schema dict. - - Args: - Returns: - `str`: String containing all the attributes that make up this configuration instance in JSON format. - """ - - mellon_dict = self.to_mellon_dict() - return json.dumps(mellon_dict, indent=2, sort_keys=True) + "\n" - - def to_mellon_dict(self) -> Dict[str, Any]: - """Return a JSON-serializable dict focusing on the Mellon schema fields only. - - params is a single flat dict composed as: {**inputs, **model_inputs, **outputs}. - """ - # inputs/model_inputs/outputs are already normalized dicts - merged_params = {} - merged_params.update(self.inputs or {}) - merged_params.update(self.model_inputs or {}) - merged_params.update(self.outputs or {}) - - return { - "node_type": self.node_type, - "blocks_names": self.blocks_names, - "params": merged_params, - } - - @classmethod - def from_mellon_dict(cls, mellon_dict: Dict[str, Any]) -> "MellonNodeConfig": - """Create a config from a Mellon schema dict produced by to_mellon_dict(). - - Splits the flat params dict back into inputs/model_inputs/outputs using the known key spaces from - MELLON_INPUT_PARAMS / MELLON_MODEL_PARAMS / MELLON_OUTPUT_PARAMS. Unknown keys are treated as inputs by - default. - """ - flat_params = mellon_dict.get("params", {}) - - inputs: Dict[str, Any] = {} - model_inputs: Dict[str, Any] = {} - outputs: Dict[str, Any] = {} - - for param_name, param_dict in flat_params.items(): - if param_dict.get("display", "") == "output": - outputs[param_name] = param_dict - elif param_dict.get("type", "") in ("diffusers_auto_model", "diffusers_auto_models"): - model_inputs[param_name] = param_dict - else: - inputs[param_name] = param_dict - - return cls( - inputs=inputs, - model_inputs=model_inputs, - outputs=outputs, - blocks_names=mellon_dict.get("blocks_names", []), - node_type=mellon_dict.get("node_type"), - ) - - # YiYi Notes: not used yet - @classmethod - def from_blocks(cls, blocks: ModularPipelineBlocks, node_type: str) -> "MellonNodeConfig": - """ - Create an instance from a ModularPipeline object. If a preset exists in NODE_TYPE_PARAMS_MAP for the node_type, - use it; otherwise fall back to deriving lists from the pipeline's expected inputs/components/outputs. 
- """ - if node_type not in NODE_TYPE_PARAMS_MAP: - raise ValueError(f"Node type {node_type} not supported") - - blocks_names = list(blocks.sub_blocks.keys()) - - default_node_config = NODE_TYPE_PARAMS_MAP[node_type] - inputs_list: List[Union[str, MellonParam]] = default_node_config.get("inputs", []) - model_inputs_list: List[Union[str, MellonParam]] = default_node_config.get("model_inputs", []) - outputs_list: List[Union[str, MellonParam]] = default_node_config.get("outputs", []) - - for required_input_name in blocks.required_inputs: - if required_input_name not in inputs_list: - inputs_list.append( - MellonParam( - name=required_input_name, label=required_input_name, type=required_input_name, display="input" - ) - ) - - for component_spec in blocks.expected_components: - if component_spec.name not in model_inputs_list: - model_inputs_list.append( - MellonParam( - name=component_spec.name, - label=component_spec.name, - type="diffusers_auto_model", - display="input", - ) - ) - - return cls( - inputs=inputs_list, - model_inputs=model_inputs_list, - outputs=outputs_list, - blocks_names=blocks_names, - node_type=node_type, - ) - - -# Minimal modular registry for Mellon node configs -class ModularMellonNodeRegistry: - """Registry mapping (pipeline class, blocks_name) -> list of MellonNodeConfig.""" - - def __init__(self): - self._registry = {} - self._initialized = False - - def register(self, pipeline_cls: type, node_params: Dict[str, MellonNodeConfig]): - if not self._initialized: - _initialize_registry(self) - self._registry[pipeline_cls] = node_params - - def get(self, pipeline_cls: type) -> MellonNodeConfig: - if not self._initialized: - _initialize_registry(self) - return self._registry.get(pipeline_cls, None) - - def get_all(self) -> Dict[type, Dict[str, MellonNodeConfig]]: - if not self._initialized: - _initialize_registry(self) - return self._registry - - -def _register_preset_node_types( - pipeline_cls, params_map: Dict[str, Dict[str, Any]], registry: ModularMellonNodeRegistry -): - """Register all node-type presets for a given pipeline class from a params map.""" - node_configs = {} - for node_type, spec in params_map.items(): - node_config = MellonNodeConfig( - inputs=spec.get("inputs", []), - model_inputs=spec.get("model_inputs", []), - outputs=spec.get("outputs", []), - blocks_names=spec.get("block_names", []), - node_type=node_type, - ) - node_configs[node_type] = node_config - registry.register(pipeline_cls, node_configs) - - -def _initialize_registry(registry: ModularMellonNodeRegistry): - """Initialize the registry and register all available pipeline configs.""" - print("Initializing registry") - - registry._initialized = True - - try: - from .qwenimage.modular_pipeline import QwenImageModularPipeline - from .qwenimage.node_utils import QwenImage_NODE_TYPES_PARAMS_MAP - - _register_preset_node_types(QwenImageModularPipeline, QwenImage_NODE_TYPES_PARAMS_MAP, registry) - except Exception: - raise Exception("Failed to register QwenImageModularPipeline") - - try: - from .stable_diffusion_xl.modular_pipeline import StableDiffusionXLModularPipeline - from .stable_diffusion_xl.node_utils import SDXL_NODE_TYPES_PARAMS_MAP - - _register_preset_node_types(StableDiffusionXLModularPipeline, SDXL_NODE_TYPES_PARAMS_MAP, registry) - except Exception: - raise Exception("Failed to register StableDiffusionXLModularPipeline") + raise EnvironmentError(f"The config file at '{config_file}' is not a valid JSON file.") diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py 
b/src/diffusers/modular_pipelines/modular_pipeline.py index 17c0117bff..c5fa4cf992 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -501,15 +501,19 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin): @property def input_names(self) -> List[str]: - return [input_param.name for input_param in self.inputs] + return [input_param.name for input_param in self.inputs if input_param.name is not None] @property def intermediate_output_names(self) -> List[str]: - return [output_param.name for output_param in self.intermediate_outputs] + return [output_param.name for output_param in self.intermediate_outputs if output_param.name is not None] @property def output_names(self) -> List[str]: - return [output_param.name for output_param in self.outputs] + return [output_param.name for output_param in self.outputs if output_param.name is not None] + + @property + def component_names(self) -> List[str]: + return [component.name for component in self.expected_components] @property def doc(self): @@ -1525,10 +1529,8 @@ class ModularPipeline(ConfigMixin, PushToHubMixin): if blocks is None: if modular_config_dict is not None: blocks_class_name = modular_config_dict.get("_blocks_class_name") - elif config_dict is not None: - blocks_class_name = self.get_default_blocks_name(config_dict) else: - blocks_class_name = None + blocks_class_name = self.get_default_blocks_name(config_dict) if blocks_class_name is not None: diffusers_module = importlib.import_module("diffusers") blocks_class = getattr(diffusers_module, blocks_class_name) @@ -1625,7 +1627,10 @@ class ModularPipeline(ConfigMixin, PushToHubMixin): return None, config_dict except EnvironmentError as e: - logger.debug(f" model_index.json not found in the repo: {e}") + raise EnvironmentError( + f"Failed to load config from '{pretrained_model_name_or_path}'. " + f"Could not find or load 'modular_model_index.json' or 'model_index.json'." 
+ ) from e return None, None @@ -2550,7 +2555,11 @@ class ModularPipeline(ConfigMixin, PushToHubMixin): kwargs_type = expected_input_param.kwargs_type if name in passed_kwargs: state.set(name, passed_kwargs.pop(name), kwargs_type) - elif name not in state.values: + elif kwargs_type is not None and kwargs_type in passed_kwargs: + kwargs_dict = passed_kwargs.pop(kwargs_type) + for k, v in kwargs_dict.items(): + state.set(k, v, kwargs_type) + elif name is not None and name not in state.values: state.set(name, default, kwargs_type) # Warn about unexpected inputs diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 26417162de..6e145f1855 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -30,6 +30,47 @@ from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier logger = logging.get_logger(__name__) +class QwenImageAfterDenoiseStep(ModularPipelineBlocks): + model_name = "qwenimage" + + @property + def description(self) -> str: + return "Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, channels, 1, height, width)" + + @property + def expected_components(self) -> List[ComponentSpec]: + components = [ + ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"), + ] + + return components + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="height", required=True), + InputParam(name="width", required=True), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The latents to decode, can be generated in the denoise step", + ), + ] + + @torch.no_grad() + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + vae_scale_factor = components.vae_scale_factor + block_state.latents = components.pachifier.unpack_latents( + block_state.latents, block_state.height, block_state.width, vae_scale_factor=vae_scale_factor + ) + + self.set_block_state(state, block_state) + return components, state + + class QwenImageDecoderStep(ModularPipelineBlocks): model_name = "qwenimage" @@ -41,7 +82,6 @@ class QwenImageDecoderStep(ModularPipelineBlocks): def expected_components(self) -> List[ComponentSpec]: components = [ ComponentSpec("vae", AutoencoderKLQwenImage), - ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"), ] return components @@ -49,8 +89,6 @@ class QwenImageDecoderStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="height", required=True), - InputParam(name="width", required=True), InputParam( name="latents", required=True, @@ -74,10 +112,12 @@ class QwenImageDecoderStep(ModularPipelineBlocks): block_state = self.get_block_state(state) # YiYi Notes: remove support for output_type = "latents', we can just skip decode/encode step in modular - vae_scale_factor = components.vae_scale_factor - block_state.latents = components.pachifier.unpack_latents( - block_state.latents, block_state.height, block_state.width, vae_scale_factor=vae_scale_factor - ) + if block_state.latents.ndim == 4: + block_state.latents = block_state.latents.unsqueeze(dim=1) + elif block_state.latents.ndim != 5: + raise ValueError( + f"expect latents to be a 4D or 5D tensor but got: {block_state.latents.shape}. 
Please make sure the latents are unpacked before decode step." + ) block_state.latents = block_state.latents.to(components.vae.dtype) latents_mean = ( diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py index 55a7ae328f..dcce0cab5d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py @@ -26,7 +26,12 @@ from .before_denoise import ( QwenImageSetTimestepsStep, QwenImageSetTimestepsWithStrengthStep, ) -from .decoders import QwenImageDecoderStep, QwenImageInpaintProcessImagesOutputStep, QwenImageProcessImagesOutputStep +from .decoders import ( + QwenImageAfterDenoiseStep, + QwenImageDecoderStep, + QwenImageInpaintProcessImagesOutputStep, + QwenImageProcessImagesOutputStep, +) from .denoise import ( QwenImageControlNetDenoiseStep, QwenImageDenoiseStep, @@ -92,6 +97,7 @@ TEXT2IMAGE_BLOCKS = InsertableDict( ("set_timesteps", QwenImageSetTimestepsStep()), ("prepare_rope_inputs", QwenImageRoPEInputsStep()), ("denoise", QwenImageDenoiseStep()), + ("after_denoise", QwenImageAfterDenoiseStep()), ("decode", QwenImageDecodeStep()), ] ) @@ -205,6 +211,7 @@ INPAINT_BLOCKS = InsertableDict( ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), ("prepare_rope_inputs", QwenImageRoPEInputsStep()), ("denoise", QwenImageInpaintDenoiseStep()), + ("after_denoise", QwenImageAfterDenoiseStep()), ("decode", QwenImageInpaintDecodeStep()), ] ) @@ -264,6 +271,7 @@ IMAGE2IMAGE_BLOCKS = InsertableDict( ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()), ("prepare_rope_inputs", QwenImageRoPEInputsStep()), ("denoise", QwenImageDenoiseStep()), + ("after_denoise", QwenImageAfterDenoiseStep()), ("decode", QwenImageDecodeStep()), ] ) @@ -529,8 +537,16 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): QwenImageAutoBeforeDenoiseStep, QwenImageOptionalControlNetBeforeDenoiseStep, QwenImageAutoDenoiseStep, + QwenImageAfterDenoiseStep, + ] + block_names = [ + "input", + "controlnet_input", + "before_denoise", + "controlnet_before_denoise", + "denoise", + "after_denoise", ] - block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise"] @property def description(self): @@ -653,6 +669,7 @@ EDIT_BLOCKS = InsertableDict( ("set_timesteps", QwenImageSetTimestepsStep()), ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), ("denoise", QwenImageEditDenoiseStep()), + ("after_denoise", QwenImageAfterDenoiseStep()), ("decode", QwenImageDecodeStep()), ] ) @@ -702,6 +719,7 @@ EDIT_INPAINT_BLOCKS = InsertableDict( ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()), ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()), ("denoise", QwenImageEditInpaintDenoiseStep()), + ("after_denoise", QwenImageAfterDenoiseStep()), ("decode", QwenImageInpaintDecodeStep()), ] ) @@ -841,8 +859,9 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): QwenImageEditAutoInputStep, QwenImageEditAutoBeforeDenoiseStep, QwenImageEditAutoDenoiseStep, + QwenImageAfterDenoiseStep, ] - block_names = ["input", "before_denoise", "denoise"] + block_names = ["input", "before_denoise", "denoise", "after_denoise"] @property def description(self): @@ -954,6 +973,7 @@ EDIT_PLUS_BLOCKS = InsertableDict( ("set_timesteps", QwenImageSetTimestepsStep()), ("prepare_rope_inputs", QwenImageEditPlusRoPEInputsStep()), ("denoise", QwenImageEditDenoiseStep()), + ("after_denoise", QwenImageAfterDenoiseStep()), 
("decode", QwenImageDecodeStep()), ] ) @@ -1037,8 +1057,9 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): QwenImageEditPlusAutoInputStep, QwenImageEditPlusAutoBeforeDenoiseStep, QwenImageEditAutoDenoiseStep, + QwenImageAfterDenoiseStep, ] - block_names = ["input", "before_denoise", "denoise"] + block_names = ["input", "before_denoise", "denoise", "after_denoise"] @property def description(self): diff --git a/src/diffusers/modular_pipelines/qwenimage/node_utils.py b/src/diffusers/modular_pipelines/qwenimage/node_utils.py deleted file mode 100644 index 3230ece68a..0000000000 --- a/src/diffusers/modular_pipelines/qwenimage/node_utils.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# mellon nodes -QwenImage_NODE_TYPES_PARAMS_MAP = { - "controlnet": { - "inputs": [ - "control_image", - "controlnet_conditioning_scale", - "control_guidance_start", - "control_guidance_end", - "height", - "width", - ], - "model_inputs": [ - "controlnet", - "vae", - ], - "outputs": [ - "controlnet_out", - ], - "block_names": ["controlnet_vae_encoder"], - }, - "denoise": { - "inputs": [ - "embeddings", - "width", - "height", - "seed", - "num_inference_steps", - "guidance_scale", - "image_latents", - "strength", - "controlnet", - ], - "model_inputs": [ - "unet", - "guider", - "scheduler", - ], - "outputs": [ - "latents", - "latents_preview", - ], - "block_names": ["denoise"], - }, - "vae_encoder": { - "inputs": [ - "image", - "width", - "height", - ], - "model_inputs": [ - "vae", - ], - "outputs": [ - "image_latents", - ], - }, - "text_encoder": { - "inputs": [ - "prompt", - "negative_prompt", - ], - "model_inputs": [ - "text_encoders", - ], - "outputs": [ - "embeddings", - ], - }, - "decoder": { - "inputs": [ - "latents", - ], - "model_inputs": [ - "vae", - ], - "outputs": [ - "images", - ], - }, -} diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py deleted file mode 100644 index 3e788bf947..0000000000 --- a/src/diffusers/modular_pipelines/stable_diffusion_xl/node_utils.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright 2025 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -SDXL_NODE_TYPES_PARAMS_MAP = { - "controlnet": { - "inputs": [ - "control_image", - "controlnet_conditioning_scale", - "control_guidance_start", - "control_guidance_end", - "height", - "width", - ], - "model_inputs": [ - "controlnet", - ], - "outputs": [ - "controlnet_out", - ], - "block_names": [None], - }, - "denoise": { - "inputs": [ - "embeddings", - "width", - "height", - "seed", - "num_inference_steps", - "guidance_scale", - "image_latents", - "strength", - # custom adapters coming in as inputs - "controlnet", - # ip_adapter is optional and custom; include if available - "ip_adapter", - ], - "model_inputs": [ - "unet", - "guider", - "scheduler", - ], - "outputs": [ - "latents", - "latents_preview", - ], - "block_names": ["denoise"], - }, - "vae_encoder": { - "inputs": [ - "image", - "width", - "height", - ], - "model_inputs": [ - "vae", - ], - "outputs": [ - "image_latents", - ], - "block_names": ["vae_encoder"], - }, - "text_encoder": { - "inputs": [ - "prompt", - "negative_prompt", - ], - "model_inputs": [ - "text_encoders", - ], - "outputs": [ - "embeddings", - ], - "block_names": ["text_encoder"], - }, - "decoder": { - "inputs": [ - "latents", - ], - "model_inputs": [ - "vae", - ], - "outputs": [ - "images", - ], - "block_names": ["decode"], - }, -} diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py index ec815f77ad..3d5a00a9df 100644 --- a/src/diffusers/modular_pipelines/z_image/denoise.py +++ b/src/diffusers/modular_pipelines/z_image/denoise.py @@ -129,6 +129,10 @@ class ZImageLoopDenoiser(ModularPipelineBlocks): type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), + InputParam( + kwargs_type="denoiser_input_fields", + description="conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc.", + ), ] guider_input_names = [] uncond_guider_input_names = [] diff --git a/src/diffusers/modular_pipelines/z_image/modular_blocks.py b/src/diffusers/modular_pipelines/z_image/modular_blocks.py index a7c520301a..a54baeccaf 100644 --- a/src/diffusers/modular_pipelines/z_image/modular_blocks.py +++ b/src/diffusers/modular_pipelines/z_image/modular_blocks.py @@ -119,7 +119,7 @@ class ZImageAutoDenoiseStep(AutoPipelineBlocks): class ZImageAutoVaeImageEncoderStep(AutoPipelineBlocks): block_classes = [ZImageVaeImageEncoderStep] - block_names = ["vae_image_encoder"] + block_names = ["vae_encoder"] block_trigger_inputs = ["image"] @property @@ -137,7 +137,7 @@ class ZImageAutoBlocks(SequentialPipelineBlocks): ZImageAutoDenoiseStep, ZImageVaeDecoderStep, ] - block_names = ["text_encoder", "vae_image_encoder", "denoise", "decode"] + block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] @property def description(self) -> str: @@ -162,7 +162,7 @@ TEXT2IMAGE_BLOCKS = InsertableDict( IMAGE2IMAGE_BLOCKS = InsertableDict( [ ("text_encoder", ZImageTextEncoderStep), - ("vae_image_encoder", ZImageVaeImageEncoderStep), + ("vae_encoder", ZImageVaeImageEncoderStep), ("input", ZImageTextInputStep), ("additional_inputs", ZImageAdditionalInputsStep(image_latent_inputs=["image_latents"])), ("prepare_latents", ZImagePrepareLatentsStep), @@ -178,7 +178,7 @@ IMAGE2IMAGE_BLOCKS = InsertableDict( AUTO_BLOCKS = InsertableDict( [ ("text_encoder", ZImageTextEncoderStep), - ("vae_image_encoder", ZImageAutoVaeImageEncoderStep), + ("vae_encoder", ZImageAutoVaeImageEncoderStep), ("denoise", ZImageAutoDenoiseStep), ("decode", ZImageVaeDecoderStep), ] From 262ce19bff6b19e38aed3519fc9eb2d90d24f87a Mon Sep 17 00:00:00 2001 From: MatrixTeam-AI Date: Sat, 20 Dec 2025 07:10:40 +0800 Subject: [PATCH 11/11] Feature: Add Mambo-G Guidance as Guider (#12862) * Feature: Add Mambo-G Guidance to Qwen-Image Pipeline * change to guider implementation * fix copied code residual * Update src/diffusers/guiders/magnitude_aware_guidance.py * Apply style fixes --------- Co-authored-by: Pscgylotti Co-authored-by: YiYi Xu Co-authored-by: github-actions[bot] --- src/diffusers/guiders/__init__.py | 1 + .../guiders/magnitude_aware_guidance.py | 159 ++++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 src/diffusers/guiders/magnitude_aware_guidance.py diff --git a/src/diffusers/guiders/__init__.py b/src/diffusers/guiders/__init__.py index 4e53c373c4..58ad0c211b 100644 --- a/src/diffusers/guiders/__init__.py +++ b/src/diffusers/guiders/__init__.py @@ -25,6 +25,7 @@ if is_torch_available(): from .classifier_free_zero_star_guidance import ClassifierFreeZeroStarGuidance from .frequency_decoupled_guidance import FrequencyDecoupledGuidance from .guider_utils import BaseGuidance + from .magnitude_aware_guidance import MagnitudeAwareGuidance from .perturbed_attention_guidance import PerturbedAttentionGuidance from .skip_layer_guidance import SkipLayerGuidance from .smoothed_energy_guidance import SmoothedEnergyGuidance diff --git a/src/diffusers/guiders/magnitude_aware_guidance.py b/src/diffusers/guiders/magnitude_aware_guidance.py new file mode 100644 index 0000000000..b81cf0d3a1 --- /dev/null +++ b/src/diffusers/guiders/magnitude_aware_guidance.py @@ -0,0 +1,159 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+
+import torch
+
+from ..configuration_utils import register_to_config
+from .guider_utils import BaseGuidance, GuiderOutput, rescale_noise_cfg
+
+
+if TYPE_CHECKING:
+    from ..modular_pipelines.modular_pipeline import BlockState
+
+
+class MagnitudeAwareGuidance(BaseGuidance):
+    """
+    Magnitude-Aware Mitigation for Boosted Guidance (MAMBO-G): https://huggingface.co/papers/2508.03442
+
+    Args:
+        guidance_scale (`float`, defaults to `10.0`):
+            The scale parameter for classifier-free guidance. Higher values result in stronger conditioning on the text
+            prompt, while lower values allow for more freedom in generation. Higher values may lead to saturation and
+            deterioration of image quality.
+        alpha (`float`, defaults to `8.0`):
+            The alpha parameter for the magnitude-aware guidance. Higher values cause more aggressive suppression of
+            the guidance scale when the magnitude of the guidance update is large.
+        guidance_rescale (`float`, defaults to `0.0`):
+            The rescale factor applied to the noise predictions. This is used to improve image quality and fix
+            overexposure. Based on Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
+            Flawed](https://huggingface.co/papers/2305.08891).
+        use_original_formulation (`bool`, defaults to `False`):
+            Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
+            we use the diffusers-native implementation that has been in the codebase for a long time. See
+            [~guiders.classifier_free_guidance.ClassifierFreeGuidance] for more details.
+        start (`float`, defaults to `0.0`):
+            The fraction of the total number of denoising steps after which guidance starts.
+        stop (`float`, defaults to `1.0`):
+            The fraction of the total number of denoising steps after which guidance stops.
+ """ + + _input_predictions = ["pred_cond", "pred_uncond"] + + @register_to_config + def __init__( + self, + guidance_scale: float = 10.0, + alpha: float = 8.0, + guidance_rescale: float = 0.0, + use_original_formulation: bool = False, + start: float = 0.0, + stop: float = 1.0, + enabled: bool = True, + ): + super().__init__(start, stop, enabled) + + self.guidance_scale = guidance_scale + self.alpha = alpha + self.guidance_rescale = guidance_rescale + self.use_original_formulation = use_original_formulation + + def prepare_inputs(self, data: Dict[str, Tuple[torch.Tensor, torch.Tensor]]) -> List["BlockState"]: + tuple_indices = [0] if self.num_conditions == 1 else [0, 1] + data_batches = [] + for tuple_idx, input_prediction in zip(tuple_indices, self._input_predictions): + data_batch = self._prepare_batch(data, tuple_idx, input_prediction) + data_batches.append(data_batch) + return data_batches + + def prepare_inputs_from_block_state( + self, data: "BlockState", input_fields: Dict[str, Union[str, Tuple[str, str]]] + ) -> List["BlockState"]: + tuple_indices = [0] if self.num_conditions == 1 else [0, 1] + data_batches = [] + for tuple_idx, input_prediction in zip(tuple_indices, self._input_predictions): + data_batch = self._prepare_batch_from_block_state(input_fields, data, tuple_idx, input_prediction) + data_batches.append(data_batch) + return data_batches + + def forward(self, pred_cond: torch.Tensor, pred_uncond: Optional[torch.Tensor] = None) -> GuiderOutput: + pred = None + + if not self._is_mambo_g_enabled(): + pred = pred_cond + else: + pred = mambo_guidance( + pred_cond, + pred_uncond, + self.guidance_scale, + self.alpha, + self.use_original_formulation, + ) + + if self.guidance_rescale > 0.0: + pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale) + + return GuiderOutput(pred=pred, pred_cond=pred_cond, pred_uncond=pred_uncond) + + @property + def is_conditional(self) -> bool: + return self._count_prepared == 1 + + @property + def num_conditions(self) -> int: + num_conditions = 1 + if self._is_mambo_g_enabled(): + num_conditions += 1 + return num_conditions + + def _is_mambo_g_enabled(self) -> bool: + if not self._enabled: + return False + + is_within_range = True + if self._num_inference_steps is not None: + skip_start_step = int(self._start * self._num_inference_steps) + skip_stop_step = int(self._stop * self._num_inference_steps) + is_within_range = skip_start_step <= self._step < skip_stop_step + + is_close = False + if self.use_original_formulation: + is_close = math.isclose(self.guidance_scale, 0.0) + else: + is_close = math.isclose(self.guidance_scale, 1.0) + + return is_within_range and not is_close + + +def mambo_guidance( + pred_cond: torch.Tensor, + pred_uncond: torch.Tensor, + guidance_scale: float, + alpha: float = 8.0, + use_original_formulation: bool = False, +): + dim = list(range(1, len(pred_cond.shape))) + diff = pred_cond - pred_uncond + ratio = torch.norm(diff, dim=dim, keepdim=True) / torch.norm(pred_uncond, dim=dim, keepdim=True) + guidance_scale_final = ( + guidance_scale * torch.exp(-alpha * ratio) + if use_original_formulation + else 1.0 + (guidance_scale - 1.0) * torch.exp(-alpha * ratio) + ) + pred = pred_cond if use_original_formulation else pred_uncond + pred = pred + guidance_scale_final * diff + + return pred
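A quick way to see what the new guider does is to call the `mambo_guidance` helper it ships with on dummy predictions: in the default (diffusers-native) formulation, the fixed scale `w` is replaced by `1 + (w - 1) * exp(-alpha * ||pred_cond - pred_uncond|| / ||pred_uncond||)`, so large conditional updates get almost no boost while small ones keep close to the full `guidance_scale`. The sketch below is illustrative only and is not part of the patch; the tensor shapes and values are arbitrary, and the import path simply mirrors the module location added in this diff.

```python
# Illustrative sketch: exercise mambo_guidance with dummy noise predictions.
import torch

from diffusers.guiders.magnitude_aware_guidance import mambo_guidance

pred_uncond = torch.randn(2, 4, 64, 64)
# Try a small and a large conditional update relative to the unconditional prediction.
for update_size in (0.01, 1.0):
    pred_cond = pred_uncond + update_size * torch.randn_like(pred_uncond)
    diff = pred_cond - pred_uncond
    # Same per-sample ratio the guider computes internally (norms over all non-batch dims).
    ratio = diff.flatten(1).norm(dim=1) / pred_uncond.flatten(1).norm(dim=1)
    # Default formulation: 1 + (w - 1) * exp(-alpha * ratio).
    effective_scale = 1.0 + (10.0 - 1.0) * torch.exp(-8.0 * ratio)
    pred = mambo_guidance(pred_cond, pred_uncond, guidance_scale=10.0, alpha=8.0)
    print(f"update={update_size}  ratio~{ratio.mean():.3f}  effective scale~{effective_scale.mean():.2f}  pred {tuple(pred.shape)}")
```

In a modular pipeline the same math would presumably be applied by constructing `MagnitudeAwareGuidance(guidance_scale=10.0, alpha=8.0)` and passing it wherever a `BaseGuidance` guider is accepted, for example the `guider` input of the denoise nodes above.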