From b1c77f67ac9ae0054a3b54a3e8da9d277a83f389 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 26 Jan 2026 17:12:13 -1000 Subject: [PATCH 01/10] [modular] add auto_docstring & more doc related refactors (#12958) * up * up up * update outputs * style * add modular_auto_docstring! * more auto docstring * style * up up up * more more * up * address feedbacks * add TODO in the description for empty docstring * refactor based on dhruv's feedback: remove the class method * add template method * up * up up up * apply auto docstring * make style * rmove space in make docstring * Apply suggestions from code review * revert change in z * fix * Apply style fixes * include auto-docstring check in the modular ci. (#13004) * Run ruff format after auto docstring generation * up * upup * upup * style --------- Co-authored-by: github-actions[bot] Co-authored-by: Sayak Paul --- .github/workflows/pr_modular_tests.yml | 20 +- Makefile | 4 + .../modular_pipeline_utils.py | 248 ++++- .../qwenimage/before_denoise.py | 507 +++++++++-- .../modular_pipelines/qwenimage/decoders.py | 218 ++++- .../modular_pipelines/qwenimage/denoise.py | 380 ++++++-- .../modular_pipelines/qwenimage/encoders.py | 854 ++++++++++++------ .../modular_pipelines/qwenimage/inputs.py | 492 ++++++++-- .../qwenimage/modular_blocks_qwenimage.py | 760 +++++++++++++++- .../modular_blocks_qwenimage_edit.py | 459 +++++++++- .../modular_blocks_qwenimage_edit_plus.py | 237 ++++- .../modular_blocks_qwenimage_layered.py | 214 ++++- .../modular_pipelines/z_image/denoise.py | 2 +- utils/modular_auto_docstring.py | 352 ++++++++ 14 files changed, 4106 insertions(+), 641 deletions(-) create mode 100644 utils/modular_auto_docstring.py diff --git a/.github/workflows/pr_modular_tests.yml b/.github/workflows/pr_modular_tests.yml index 3bdfb4ca99..89b502d364 100644 --- a/.github/workflows/pr_modular_tests.yml +++ b/.github/workflows/pr_modular_tests.yml @@ -75,9 +75,27 @@ jobs: if: ${{ failure() }} run: | echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY + check_auto_docs: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v6 + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.10" + - name: Install dependencies + run: | + pip install --upgrade pip + pip install .[quality] + - name: Check auto docs + run: make modular-autodoctrings + - name: Check if failure + if: ${{ failure() }} + run: | + echo "Auto docstring checks failed. Please run `python utils/modular_auto_docstring.py --fix_and_overwrite`." 
>> $GITHUB_STEP_SUMMARY run_fast_tests: - needs: [check_code_quality, check_repository_consistency] + needs: [check_code_quality, check_repository_consistency, check_auto_docs] name: Fast PyTorch Modular Pipeline CPU tests runs-on: diff --git a/Makefile b/Makefile index 9af2e8b1a5..b90ff82ab2 100644 --- a/Makefile +++ b/Makefile @@ -70,6 +70,10 @@ fix-copies: python utils/check_copies.py --fix_and_overwrite python utils/check_dummies.py --fix_and_overwrite +# Auto docstrings in modular blocks +modular-autodoctrings: + python utils/modular_auto_docstring.py + # Run tests for the library test: diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index aa421a5372..f3b12d7161 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -18,6 +18,7 @@ from collections import OrderedDict from dataclasses import dataclass, field, fields from typing import Any, Dict, List, Literal, Optional, Type, Union +import PIL.Image import torch from ..configuration_utils import ConfigMixin, FrozenDict @@ -323,11 +324,192 @@ class ConfigSpec: description: Optional[str] = None -# YiYi Notes: both inputs and intermediate_inputs are InputParam objects -# however some fields are not relevant for intermediate_inputs -# e.g. unlike inputs, required only used in docstring for intermediate_inputs, we do not check if a required intermediate inputs is passed -# default is not used for intermediate_inputs, we only use default from inputs, so it is ignored if it is set for intermediate_inputs -# -> should we use different class for inputs and intermediate_inputs? +# ====================================================== +# InputParam and OutputParam templates +# ====================================================== + +INPUT_PARAM_TEMPLATES = { + "prompt": { + "type_hint": str, + "required": True, + "description": "The prompt or prompts to guide image generation.", + }, + "negative_prompt": { + "type_hint": str, + "description": "The prompt or prompts not to guide the image generation.", + }, + "max_sequence_length": { + "type_hint": int, + "default": 512, + "description": "Maximum sequence length for prompt encoding.", + }, + "height": { + "type_hint": int, + "description": "The height in pixels of the generated image.", + }, + "width": { + "type_hint": int, + "description": "The width in pixels of the generated image.", + }, + "num_inference_steps": { + "type_hint": int, + "default": 50, + "description": "The number of denoising steps.", + }, + "num_images_per_prompt": { + "type_hint": int, + "default": 1, + "description": "The number of images to generate per prompt.", + }, + "generator": { + "type_hint": torch.Generator, + "description": "Torch generator for deterministic generation.", + }, + "sigmas": { + "type_hint": List[float], + "description": "Custom sigmas for the denoising process.", + }, + "strength": { + "type_hint": float, + "default": 0.9, + "description": "Strength for img2img/inpainting.", + }, + "image": { + "type_hint": Union[PIL.Image.Image, List[PIL.Image.Image]], + "required": True, + "description": "Reference image(s) for denoising. 
Can be a single image or list of images.", + }, + "latents": { + "type_hint": torch.Tensor, + "description": "Pre-generated noisy latents for image generation.", + }, + "timesteps": { + "type_hint": torch.Tensor, + "description": "Timesteps for the denoising process.", + }, + "output_type": { + "type_hint": str, + "default": "pil", + "description": "Output format: 'pil', 'np', 'pt'.", + }, + "attention_kwargs": { + "type_hint": Dict[str, Any], + "description": "Additional kwargs for attention processors.", + }, + "denoiser_input_fields": { + "name": None, + "kwargs_type": "denoiser_input_fields", + "description": "conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", + }, + # inpainting + "mask_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Mask image for inpainting.", + }, + "padding_mask_crop": { + "type_hint": int, + "description": "Padding for mask cropping in inpainting.", + }, + # controlnet + "control_image": { + "type_hint": PIL.Image.Image, + "required": True, + "description": "Control image for ControlNet conditioning.", + }, + "control_guidance_start": { + "type_hint": float, + "default": 0.0, + "description": "When to start applying ControlNet.", + }, + "control_guidance_end": { + "type_hint": float, + "default": 1.0, + "description": "When to stop applying ControlNet.", + }, + "controlnet_conditioning_scale": { + "type_hint": float, + "default": 1.0, + "description": "Scale for ControlNet conditioning.", + }, + "layers": { + "type_hint": int, + "default": 4, + "description": "Number of layers to extract from the image", + }, + # common intermediate inputs + "prompt_embeds": { + "type_hint": torch.Tensor, + "required": True, + "description": "text embeddings used to guide the image generation. Can be generated from text_encoder step.", + }, + "prompt_embeds_mask": { + "type_hint": torch.Tensor, + "required": True, + "description": "mask for the text embeddings. Can be generated from text_encoder step.", + }, + "negative_prompt_embeds": { + "type_hint": torch.Tensor, + "description": "negative text embeddings used to guide the image generation. Can be generated from text_encoder step.", + }, + "negative_prompt_embeds_mask": { + "type_hint": torch.Tensor, + "description": "mask for the negative text embeddings. Can be generated from text_encoder step.", + }, + "image_latents": { + "type_hint": torch.Tensor, + "required": True, + "description": "image latents used to guide the image generation. Can be generated from vae_encoder step.", + }, + "batch_size": { + "type_hint": int, + "default": 1, + "description": "Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. 
Can be generated in input step.", + }, + "dtype": { + "type_hint": torch.dtype, + "default": torch.float32, + "description": "The dtype of the model inputs, can be generated in input step.", + }, +} + +OUTPUT_PARAM_TEMPLATES = { + "images": { + "type_hint": List[PIL.Image.Image], + "description": "Generated images.", + }, + "latents": { + "type_hint": torch.Tensor, + "description": "Denoised latents.", + }, + # intermediate outputs + "prompt_embeds": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The prompt embeddings.", + }, + "prompt_embeds_mask": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The encoder attention mask.", + }, + "negative_prompt_embeds": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The negative prompt embeddings.", + }, + "negative_prompt_embeds_mask": { + "type_hint": torch.Tensor, + "kwargs_type": "denoiser_input_fields", + "description": "The negative prompt embeddings mask.", + }, + "image_latents": { + "type_hint": torch.Tensor, + "description": "The latent representation of the input image.", + }, +} + + @dataclass class InputParam: """Specification for an input parameter.""" @@ -337,11 +519,31 @@ class InputParam: default: Any = None required: bool = False description: str = "" - kwargs_type: str = None # YiYi Notes: remove this feature (maybe) + kwargs_type: str = None def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" + @classmethod + def template(cls, template_name: str, note: str = None, **overrides) -> "InputParam": + """Get template for name if exists, otherwise raise ValueError.""" + if template_name not in INPUT_PARAM_TEMPLATES: + raise ValueError(f"InputParam template for {template_name} not found") + + template_kwargs = INPUT_PARAM_TEMPLATES[template_name].copy() + + # Determine the actual param name: + # 1. From overrides if provided + # 2. From template if present + # 3. Fall back to template_name + name = overrides.pop("name", template_kwargs.pop("name", template_name)) + + if note and "description" in template_kwargs: + template_kwargs["description"] = f"{template_kwargs['description']} ({note})" + + template_kwargs.update(overrides) + return cls(name=name, **template_kwargs) + @dataclass class OutputParam: @@ -350,13 +552,33 @@ class OutputParam: name: str type_hint: Any = None description: str = "" - kwargs_type: str = None # YiYi notes: remove this feature (maybe) + kwargs_type: str = None def __repr__(self): return ( f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>" ) + @classmethod + def template(cls, template_name: str, note: str = None, **overrides) -> "OutputParam": + """Get template for name if exists, otherwise raise ValueError.""" + if template_name not in OUTPUT_PARAM_TEMPLATES: + raise ValueError(f"OutputParam template for {template_name} not found") + + template_kwargs = OUTPUT_PARAM_TEMPLATES[template_name].copy() + + # Determine the actual param name: + # 1. From overrides if provided + # 2. From template if present + # 3. 
Fall back to template_name + name = overrides.pop("name", template_kwargs.pop("name", template_name)) + + if note and "description" in template_kwargs: + template_kwargs["description"] = f"{template_kwargs['description']} ({note})" + + template_kwargs.update(overrides) + return cls(name=name, **template_kwargs) + def format_inputs_short(inputs): """ @@ -509,10 +731,12 @@ def format_params(params, header="Args", indent_level=4, max_line_length=115): desc = re.sub(r"\[(.*?)\]\((https?://[^\s\)]+)\)", r"[\1](\2)", param.description) wrapped_desc = wrap_text(desc, desc_indent, max_line_length) param_str += f"\n{desc_indent}{wrapped_desc}" + else: + param_str += f"\n{desc_indent}TODO: Add description." formatted_params.append(param_str) - return "\n\n".join(formatted_params) + return "\n".join(formatted_params) def format_input_params(input_params, indent_level=4, max_line_length=115): @@ -582,7 +806,7 @@ def format_components(components, indent_level=4, max_line_length=115, add_empty loading_field_values = [] for field_name in component.loading_fields(): field_value = getattr(component, field_name) - if field_value is not None: + if field_value: loading_field_values.append(f"{field_name}={field_value}") # Add loading field information if available @@ -669,17 +893,17 @@ def make_doc_string( # Add description if description: desc_lines = description.strip().split("\n") - aligned_desc = "\n".join(" " + line for line in desc_lines) + aligned_desc = "\n".join(" " + line.rstrip() for line in desc_lines) output += aligned_desc + "\n\n" # Add components section if provided if expected_components and len(expected_components) > 0: - components_str = format_components(expected_components, indent_level=2) + components_str = format_components(expected_components, indent_level=2, add_empty_lines=False) output += components_str + "\n\n" # Add configs section if provided if expected_configs and len(expected_configs) > 0: - configs_str = format_configs(expected_configs, indent_level=2) + configs_str = format_configs(expected_configs, indent_level=2, add_empty_lines=False) output += configs_str + "\n\n" # Add inputs section diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index d9c8cbb01d..80a379da6b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -118,7 +118,40 @@ def get_timesteps(scheduler, num_inference_steps, strength): # ==================== +# auto_docstring class QwenImagePrepareLatentsStep(ModularPipelineBlocks): + """ + Prepare initial random noise for the generation process + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. 
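Note: the `template()` classmethods added above let block authors pull shared parameter definitions from `INPUT_PARAM_TEMPLATES` / `OUTPUT_PARAM_TEMPLATES` instead of re-declaring them in every block. A minimal usage sketch, using only the API introduced in this patch (the import path is assumed from the file location):

    from diffusers.modular_pipelines.modular_pipeline_utils import InputParam, OutputParam

    # Plain lookup: name, type_hint, default and description come from INPUT_PARAM_TEMPLATES.
    latents_in = InputParam.template("latents")

    # Keyword overrides are applied on top of the template entry.
    height_in = InputParam.template("height", required=True)

    # A `note` is appended to the templated description in parentheses.
    image_latents_in = InputParam.template(
        "image_latents", note="Can be generated from vae encoder and updated in input step."
    )

    # OutputParam.template works the same way against OUTPUT_PARAM_TEMPLATES.
    latents_out = OutputParam.template("latents", note="unpacked to B, C, layers+1, H, W")

An unknown template name raises a `ValueError`, so a mistyped name fails loudly instead of silently producing an empty parameter spec.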
+ + Outputs: + height (`int`): + if not set, updated to default value + width (`int`): + if not set, updated to default value + latents (`Tensor`): + The initial latents to use for the denoising process + """ + model_name = "qwenimage" @property @@ -134,28 +167,20 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents"), - InputParam(name="height"), - InputParam(name="width"), - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="generator"), - InputParam( - name="batch_size", - required=True, - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.", - ), - InputParam( - name="dtype", - required=True, - type_hint=torch.dtype, - description="The dtype of the model inputs, can be generated in input step.", - ), + InputParam.template("latents"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("num_images_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size"), + InputParam.template("dtype"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ + OutputParam(name="height", type_hint=int, description="if not set, updated to default value"), + OutputParam(name="width", type_hint=int, description="if not set, updated to default value"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -209,7 +234,42 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): + """ + Prepare initial random noise (B, layers+1, C, H, W) for the generation process + + Components: + pachifier (`QwenImageLayeredPachifier`) + + Inputs: + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. + + Outputs: + height (`int`): + if not set, updated to default value + width (`int`): + if not set, updated to default value + latents (`Tensor`): + The initial latents to use for the denoising process + """ + model_name = "qwenimage-layered" @property @@ -225,29 +285,21 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents"), - InputParam(name="height"), - InputParam(name="width"), - InputParam(name="layers", default=4), - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="generator"), - InputParam( - name="batch_size", - required=True, - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. 
Can be generated in input step.", - ), - InputParam( - name="dtype", - required=True, - type_hint=torch.dtype, - description="The dtype of the model inputs, can be generated in input step.", - ), + InputParam.template("latents"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("layers"), + InputParam.template("num_images_per_prompt"), + InputParam.template("generator"), + InputParam.template("batch_size"), + InputParam.template("dtype"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ + OutputParam(name="height", type_hint=int, description="if not set, updated to default value"), + OutputParam(name="width", type_hint=int, description="if not set, updated to default value"), OutputParam( name="latents", type_hint=torch.Tensor, @@ -301,7 +353,31 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): + """ + Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, + prepare_latents. Both noise and image latents should alreadybe patchified. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + latents (`Tensor`): + The initial random noised, can be generated in prepare latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and updated in input step.) + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + + Outputs: + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + latents (`Tensor`): + The scaled noisy latents to use for inpainting/image-to-image denoising. + """ + model_name = "qwenimage" @property @@ -323,12 +399,7 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): type_hint=torch.Tensor, description="The initial random noised, can be generated in prepare latent step.", ), - InputParam( - name="image_latents", - required=True, - type_hint=torch.Tensor, - description="The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.", - ), + InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."), InputParam( name="timesteps", required=True, @@ -345,6 +416,11 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): type_hint=torch.Tensor, description="The initial random noised used for inpainting denoising.", ), + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The scaled noisy latents to use for inpainting/image-to-image denoising.", + ), ] @staticmethod @@ -382,7 +458,29 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks): + """ + Step that creates mask latents from preprocessed mask_image by interpolating to latent space. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + processed_mask_image (`Tensor`): + The processed mask to use for the inpainting process. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. 
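The block classes in these files are tagged with `# auto_docstring`: their triple-quoted docstrings are generated from each block's description, components, inputs and outputs by `utils/modular_auto_docstring.py`, and the new CI job fails when they go stale. One way to reproduce that check locally is sketched below; this is an equivalent formulation for illustration, not necessarily how the utility or the Make target implements it:

    import subprocess

    # Regenerate the docstrings in place (--fix_and_overwrite is the flag referenced in the CI hint),
    # then fail if regeneration changed anything that is not committed.
    subprocess.run(["python", "utils/modular_auto_docstring.py", "--fix_and_overwrite"], check=True)
    stale = subprocess.run(["git", "diff", "--quiet"], check=False).returncode != 0
    if stale:
        raise SystemExit("auto-generated docstrings are out of date; commit the regenerated files")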
+ + Outputs: + mask (`Tensor`): + The mask to use for the inpainting process. + """ + model_name = "qwenimage" @property @@ -404,9 +502,9 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks): type_hint=torch.Tensor, description="The processed mask to use for the inpainting process.", ), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="dtype", required=True), + InputParam.template("height", required=True), + InputParam.template("width", required=True), + InputParam.template("dtype"), ] @property @@ -450,7 +548,27 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks): # ==================== +# auto_docstring class QwenImageSetTimestepsStep(ModularPipelineBlocks): + """ + Step that sets the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + latents (`Tensor`): + The initial random noised latents for the denoising process. Can be generated in prepare latents step. + + Outputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process + """ + model_name = "qwenimage" @property @@ -466,13 +584,13 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="num_inference_steps", default=50), - InputParam(name="sigmas"), + InputParam.template("num_inference_steps"), + InputParam.template("sigmas"), InputParam( name="latents", required=True, type_hint=torch.Tensor, - description="The latents to use for the denoising process, used to calculate the image sequence length.", + description="The initial random noised latents for the denoising process. Can be generated in prepare latents step.", ), ] @@ -516,7 +634,27 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): + """ + Set timesteps step for QwenImage Layered with custom mu calculation based on image_latents. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. + """ + model_name = "qwenimage-layered" @property @@ -532,15 +670,17 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("num_inference_steps", default=50, type_hint=int), - InputParam("sigmas", type_hint=List[float]), - InputParam("image_latents", required=True, type_hint=torch.Tensor), + InputParam.template("num_inference_steps"), + InputParam.template("sigmas"), + InputParam.template("image_latents"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="timesteps", type_hint=torch.Tensor), + OutputParam( + name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process." 
+ ), ] @torch.no_grad() @@ -574,7 +714,32 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): + """ + Step that sets the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after prepare + latents step. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + latents (`Tensor`): + The latents to use for the denoising process. Can be generated in prepare latents step. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + + Outputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. + num_inference_steps (`int`): + The number of denoising steps to perform at inference time. Updated based on strength. + """ + model_name = "qwenimage" @property @@ -590,15 +755,15 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="num_inference_steps", default=50), - InputParam(name="sigmas"), + InputParam.template("num_inference_steps"), + InputParam.template("sigmas"), InputParam( - name="latents", + "latents", required=True, type_hint=torch.Tensor, - description="The latents to use for the denoising process, used to calculate the image sequence length.", + description="The latents to use for the denoising process. Can be generated in prepare latents step.", ), - InputParam(name="strength", default=0.9), + InputParam.template("strength", default=0.9), ] @property @@ -607,7 +772,12 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): OutputParam( name="timesteps", type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", + description="The timesteps to use for the denoising process.", + ), + OutputParam( + name="num_inference_steps", + type_hint=int, + description="The number of denoising steps to perform at inference time. Updated based on strength.", ), ] @@ -654,7 +824,29 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): ## RoPE inputs for denoiser +# auto_docstring class QwenImageRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. 
+ + Outputs: + img_shapes (`List`): + The shapes of the images latents, used for RoPE calculation + """ + model_name = "qwenimage" @property @@ -666,11 +858,11 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam.template("height", required=True), + InputParam.template("width", required=True), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -702,7 +894,34 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after + prepare_latents step + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + image_height (`int`): + The height of the reference image. Can be generated in input step. + image_width (`int`): + The width of the reference image. Can be generated in input step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + img_shapes (`List`): + The shapes of the images latents, used for RoPE calculation + """ + model_name = "qwenimage" @property @@ -712,13 +931,23 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="image_height", required=True), - InputParam(name="image_width", required=True), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam( + name="image_height", + required=True, + type_hint=int, + description="The height of the reference image. Can be generated in input step.", + ), + InputParam( + name="image_width", + required=True, + type_hint=int, + description="The width of the reference image. Can be generated in input step.", + ), + InputParam.template("height", required=True), + InputParam.template("width", required=True), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -756,7 +985,39 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus. + Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. Should be placed + after prepare_latents step. 
+ + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + image_height (`List`): + The heights of the reference images. Can be generated in input step. + image_width (`List`): + The widths of the reference images. Can be generated in input step. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + img_shapes (`List`): + The shapes of the image latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + """ + model_name = "qwenimage-edit-plus" @property @@ -770,13 +1031,23 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="image_height", required=True, type_hint=List[int]), - InputParam(name="image_width", required=True, type_hint=List[int]), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam( + name="image_height", + required=True, + type_hint=List[int], + description="The heights of the reference images. Can be generated in input step.", + ), + InputParam( + name="image_width", + required=True, + type_hint=List[int], + description="The widths of the reference images. Can be generated in input step.", + ), + InputParam.template("height", required=True), + InputParam.template("width", required=True), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -832,7 +1103,37 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): + """ + Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step + + Inputs: + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. 
+ + Outputs: + img_shapes (`List`): + The shapes of the image latents, used for RoPE calculation + txt_seq_lens (`List`): + The sequence lengths of the prompt embeds, used for RoPE calculation + negative_txt_seq_lens (`List`): + The sequence lengths of the negative prompt embeds, used for RoPE calculation + additional_t_cond (`Tensor`): + The additional t cond, used for RoPE calculation + """ + model_name = "qwenimage-layered" @property @@ -844,12 +1145,12 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="batch_size", required=True), - InputParam(name="layers", required=True), - InputParam(name="height", required=True), - InputParam(name="width", required=True), - InputParam(name="prompt_embeds_mask"), - InputParam(name="negative_prompt_embeds_mask"), + InputParam.template("batch_size"), + InputParam.template("layers"), + InputParam.template("height", required=True), + InputParam.template("width", required=True), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds_mask"), ] @property @@ -914,7 +1215,34 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): ## ControlNet inputs for denoiser + + +# auto_docstring class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): + """ + step that prepare inputs for controlnet. Insert before the Denoise Step, after set_timesteps step. + + Components: + controlnet (`QwenImageControlNetModel`) + + Inputs: + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + + Outputs: + controlnet_keep (`List`): + The controlnet keep values + """ + model_name = "qwenimage" @property @@ -930,12 +1258,17 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("control_guidance_start", default=0.0), - InputParam("control_guidance_end", default=1.0), - InputParam("controlnet_conditioning_scale", default=1.0), - InputParam("control_image_latents", required=True), + InputParam.template("control_guidance_start"), + InputParam.template("control_guidance_end"), + InputParam.template("controlnet_conditioning_scale"), InputParam( - "timesteps", + name="control_image_latents", + required=True, + type_hint=torch.Tensor, + description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.", + ), + InputParam( + name="timesteps", required=True, type_hint=torch.Tensor, description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 24a88ebfca..1adbf6bdd3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Union
+from typing import Any, Dict, List
 
-import numpy as np
-import PIL
 import torch
 
 from ...configuration_utils import FrozenDict
@@ -31,7 +29,30 @@ logger = logging.get_logger(__name__)
 
 
 # after denoising loop (unpack latents)
+
+
+# auto_docstring
 class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
+    """
+    Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size,
+    channels, 1, height, width)
+
+    Components:
+        pachifier (`QwenImagePachifier`)
+
+    Inputs:
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        latents (`Tensor`):
+            The latents to decode, can be generated in the denoise step.
+
+    Outputs:
+        latents (`Tensor`):
+            The denoised latents unpacked to B, C, 1, H, W
+    """
+
     model_name = "qwenimage"
 
     @property
@@ -49,13 +70,21 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
             InputParam(
                 name="latents",
                 required=True,
                 type_hint=torch.Tensor,
-                description="The latents to decode, can be generated in the denoise step",
+                description="The latents to decode, can be generated in the denoise step.",
+            ),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(
+                name="latents", type_hint=torch.Tensor, description="The denoised latents unpacked to B, C, 1, H, W"
             ),
         ]
 
@@ -72,7 +101,29 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
         return components, state
 
 
+# auto_docstring
 class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
+    """
+    Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising.
+
+    Components:
+        pachifier (`QwenImageLayeredPachifier`)
+
+    Inputs:
+        latents (`Tensor`):
+            The denoised latents to decode, can be generated in the denoise step.
+        height (`int`):
+            The height in pixels of the generated image.
+        width (`int`):
+            The width in pixels of the generated image.
+        layers (`int`, *optional*, defaults to 4):
+            Number of layers to extract from the image
+
+    Outputs:
+        latents (`Tensor`):
+            Denoised latents. 
(unpacked to B, C, layers+1, H, W) + """ + model_name = "qwenimage-layered" @property @@ -88,10 +139,21 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("height", required=True, type_hint=int), - InputParam("width", required=True, type_hint=int), - InputParam("layers", required=True, type_hint=int), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step.", + ), + InputParam.template("height", required=True), + InputParam.template("width", required=True), + InputParam.template("layers"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("latents", note="unpacked to B, C, layers+1, H, W"), ] @torch.no_grad() @@ -112,7 +174,26 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): # decode step + + +# auto_docstring class QwenImageDecoderStep(ModularPipelineBlocks): + """ + Step that decodes the latents to images + + Components: + vae (`AutoencoderKLQwenImage`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. + + Outputs: + images (`List`): + Generated images. (tensor output of the vae decoder.) + """ + model_name = "qwenimage" @property @@ -134,19 +215,13 @@ class QwenImageDecoderStep(ModularPipelineBlocks): name="latents", required=True, type_hint=torch.Tensor, - description="The latents to decode, can be generated in the denoise step", + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.", ), ] @property - def intermediate_outputs(self) -> List[str]: - return [ - OutputParam( - "images", - type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], - description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array", - ) - ] + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam.template("images", note="tensor output of the vae decoder.")] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -176,7 +251,26 @@ class QwenImageDecoderStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageLayeredDecoderStep(ModularPipelineBlocks): + """ + Decode unpacked latents (B, C, layers+1, H, W) into layer images. + + Components: + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`List`): + Generated images. 
+ """ + model_name = "qwenimage-layered" @property @@ -198,14 +292,19 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("output_type", default="pil", type_hint=str), + InputParam( + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.", + ), + InputParam.template("output_type"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]), + OutputParam.template("images"), ] @torch.no_grad() @@ -251,7 +350,27 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks): # postprocess the decoded images + + +# auto_docstring class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): + """ + postprocess the generated image + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + images (`Tensor`): + the generated image tensor from decoders step + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`List`): + Generated images. + """ + model_name = "qwenimage" @property @@ -272,15 +391,19 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image from decoders step"), InputParam( - name="output_type", - default="pil", - type_hint=str, - description="The type of the output images, can be 'pil', 'np', 'pt'", + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step", ), + InputParam.template("output_type"), ] + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam.template("images")] + @staticmethod def check_inputs(output_type): if output_type not in ["pil", "np", "pt"]: @@ -301,7 +424,28 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): + """ + postprocess the generated image, optional apply the mask overally to the original image.. + + Components: + image_mask_processor (`InpaintProcessor`) + + Inputs: + images (`Tensor`): + the generated image tensor from decoders step + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. + + Outputs: + images (`List`): + Generated images. + """ + model_name = "qwenimage" @property @@ -322,16 +466,24 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("images", required=True, description="the generated image from decoders step"), InputParam( - name="output_type", - default="pil", - type_hint=str, - description="The type of the output images, can be 'pil', 'np', 'pt'", + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step", + ), + InputParam.template("output_type"), + InputParam( + name="mask_overlay_kwargs", + type_hint=Dict[str, Any], + description="The kwargs for the postprocess step to apply the mask overlay. 
generated in InpaintProcessImagesInputStep.", ), - InputParam("mask_overlay_kwargs"), ] + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [OutputParam.template("images")] + @staticmethod def check_inputs(output_type, mask_overlay_kwargs): if output_type not in ["pil", "np", "pt"]: diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index d6bcb4a94f..8579c9843a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -50,7 +50,7 @@ class QwenImageLoopBeforeDenoiser(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam( - "latents", + name="latents", required=True, type_hint=torch.Tensor, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", @@ -80,17 +80,12 @@ class QwenImageEditLoopBeforeDenoiser(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam( - "latents", + name="latents", required=True, type_hint=torch.Tensor, description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", ), - InputParam( - "image_latents", - required=True, - type_hint=torch.Tensor, - description="The initial image latents to use for the denoising process. Can be encoded in vae_encoder step and packed in prepare_image_latents step.", - ), + InputParam.template("image_latents"), ] @torch.no_grad() @@ -134,29 +129,12 @@ class QwenImageLoopBeforeDenoiserControlNet(ModularPipelineBlocks): type_hint=torch.Tensor, description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", ), + InputParam.template("controlnet_conditioning_scale", note="updated in prepare_controlnet_inputs step."), InputParam( - "controlnet_conditioning_scale", - type_hint=float, - description="The controlnet conditioning scale value to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", - ), - InputParam( - "controlnet_keep", + name="controlnet_keep", required=True, type_hint=List[float], - description="The controlnet keep values to use for the denoising process. Can be generated in prepare_controlnet_inputs step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam( - kwargs_type="denoiser_input_fields", - description=( - "All conditional model inputs for the denoiser. " - "It should contain prompt_embeds/negative_prompt_embeds." - ), + description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step.", ), ] @@ -217,28 +195,13 @@ class QwenImageLoopDenoiser(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("attention_kwargs"), - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam( - kwargs_type="denoiser_input_fields", - description="conditional model inputs for the denoiser: e.g. 
prompt_embeds, negative_prompt_embeds, etc.", - ), + InputParam.template("attention_kwargs"), + InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", required=True, type_hint=List[Tuple[int, int]], - description="The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.", + description="The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step.", ), ] @@ -317,23 +280,8 @@ class QwenImageEditLoopDenoiser(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("attention_kwargs"), - InputParam( - "latents", - required=True, - type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare_latents step.", - ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", - ), - InputParam( - kwargs_type="denoiser_input_fields", - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", - ), + InputParam.template("attention_kwargs"), + InputParam.template("denoiser_input_fields"), InputParam( "img_shapes", required=True, @@ -415,7 +363,7 @@ class QwenImageLoopAfterDenoiser(ModularPipelineBlocks): @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents."), + OutputParam.template("latents"), ] @torch.no_grad() @@ -456,24 +404,19 @@ class QwenImageLoopAfterDenoiserInpaint(ModularPipelineBlocks): type_hint=torch.Tensor, description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam( - "image_latents", - required=True, - type_hint=torch.Tensor, - description="The image latents to use for the inpainting process. Can be generated in inpaint prepare latents step.", - ), + InputParam.template("image_latents"), InputParam( "initial_noise", required=True, type_hint=torch.Tensor, description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.", ), - InputParam( - "timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", - ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("latents"), ] @torch.no_grad() @@ -515,17 +458,12 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks): def loop_inputs(self) -> List[InputParam]: return [ InputParam( - "timesteps", + name="timesteps", required=True, type_hint=torch.Tensor, description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam( - "num_inference_steps", - required=True, - type_hint=int, - description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", - ), + InputParam.template("num_inference_steps", required=True), ] @torch.no_grad() @@ -557,7 +495,42 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks): # Qwen Image (text2image, image2image) + + +# auto_docstring class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. 
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports text2image and image2image tasks for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ @@ -570,8 +543,8 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): @property def description(self) -> str: return ( - "Denoise step that iteratively denoise the latents. \n" - "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n" + "Denoise step that iteratively denoise the latents.\n" + "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method\n" "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n" " - `QwenImageLoopBeforeDenoiser`\n" " - `QwenImageLoopDenoiser`\n" @@ -581,7 +554,47 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): # Qwen Image (inpainting) +# auto_docstring class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + - `QwenImageLoopAfterDenoiserInpaint` + This block supports inpainting tasks for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + mask (`Tensor`): + The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + initial_noise (`Tensor`): + The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step. 
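The denoise variants in this file all follow the same composition pattern: subclass `QwenImageDenoiseLoopWrapper`, which owns the timestep loop, and list the sub-blocks to run inside each iteration. A minimal sketch of assembling a custom variant from the loop blocks defined above; the import path mirrors this file, and `block_names` is an assumption (it is not shown in these hunks):

    from diffusers.modular_pipelines.qwenimage.denoise import (
        QwenImageDenoiseLoopWrapper,
        QwenImageLoopAfterDenoiser,
        QwenImageLoopAfterDenoiserInpaint,
        QwenImageLoopBeforeDenoiser,
        QwenImageLoopDenoiser,
    )

    # Hypothetical variant, for illustration only.
    class MyInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
        model_name = "qwenimage"
        block_classes = [
            QwenImageLoopBeforeDenoiser,
            QwenImageLoopDenoiser,
            QwenImageLoopAfterDenoiser,
            QwenImageLoopAfterDenoiserInpaint,  # drop this entry for plain text2image/img2img
        ]
        block_names = ["before_denoiser", "denoiser", "after_denoiser", "after_denoiser_inpaint"]

        @property
        def description(self) -> str:
            return "Custom denoise loop assembled from the QwenImage loop sub-blocks."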
+ + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -606,7 +619,47 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): # Qwen Image (text2image, image2image) with controlnet +# auto_docstring class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopBeforeDenoiserControlNet` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports text2img/img2img tasks with controlnet for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer + (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + control_image_latents (`Tensor`): + The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) + controlnet_keep (`List`): + The controlnet keep values. Can be generated in prepare_controlnet_inputs step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -631,7 +684,54 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): # Qwen Image (inpainting) with controlnet +# auto_docstring class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: + - `QwenImageLoopBeforeDenoiser` + - `QwenImageLoopBeforeDenoiserControlNet` + - `QwenImageLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + - `QwenImageLoopAfterDenoiserInpaint` + This block supports inpainting tasks with controlnet for QwenImage. + + Components: + guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer + (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + control_image_latents (`Tensor`): + The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step. 
+ controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. (updated in prepare_controlnet_inputs step.) + controlnet_keep (`List`): + The controlnet keep values. Can be generated in prepare_controlnet_inputs step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. can be generated in prepare_additional_inputs step. + mask (`Tensor`): + The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + initial_noise (`Tensor`): + The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -664,7 +764,42 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): # Qwen Image Edit (image2image) +# auto_docstring class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: + - `QwenImageEditLoopBeforeDenoiser` + - `QwenImageEditLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports QwenImage Edit. + + Components: + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -687,7 +822,47 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): # Qwen Image Edit (inpainting) +# auto_docstring class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: + - `QwenImageEditLoopBeforeDenoiser` + - `QwenImageEditLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + - `QwenImageLoopAfterDenoiserInpaint` + This block supports inpainting tasks for QwenImage Edit. 
+ + Components: + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. + mask (`Tensor`): + The mask to use for the inpainting process. Can be generated in inpaint prepare latents step. + initial_noise (`Tensor`): + The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -712,7 +887,42 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): # Qwen Image Layered (image2image) +# auto_docstring class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): + """ + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: + - `QwenImageEditLoopBeforeDenoiser` + - `QwenImageEditLoopDenoiser` + - `QwenImageLoopAfterDenoiser` + This block supports QwenImage Layered. + + Components: + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) + + Inputs: + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + num_inference_steps (`int`): + The number of denoising steps. + latents (`Tensor`): + The initial latents to use for the denoising process. Can be generated in prepare_latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + img_shapes (`List`): + The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step. + + Outputs: + latents (`Tensor`): + Denoised latents. 
+ """ + model_name = "qwenimage-layered" block_classes = [ QwenImageEditLoopBeforeDenoiser, diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 4b66dd32e5..5e1821cca5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -30,7 +30,7 @@ from ...pipelines.qwenimage.pipeline_qwenimage_edit import calculate_dimensions from ...utils import logging from ...utils.torch_utils import unwrap_module from ..modular_pipeline import ModularPipelineBlocks, PipelineState -from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam from .modular_pipeline import QwenImageModularPipeline from .prompt_templates import ( QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE, @@ -259,33 +259,47 @@ def encode_vae_image( # ==================== # 1. RESIZE # ==================== +# In QwenImage pipelines, resize is a separate step because the resized image is used in VL encoding and vae encoder blocks: +# +# image (PIL.Image.Image) +# │ +# ▼ +# resized_image ([PIL.Image.Image]) +# │ +# ├──► text_encoder ──► prompt_embeds, prompt_embeds_mask +# │ (VL encoding needs the resized image for vision-language fusion) +# │ +# └──► image_processor ──► processed_image (torch.Tensor, pixel space) +# │ +# ▼ +# vae_encoder ──► image_latents (torch.Tensor, latent space) +# +# In most of our other pipelines, resizing is done as part of the image preprocessing step. +# ==================== + + +# auto_docstring class QwenImageEditResizeStep(ModularPipelineBlocks): + """ + Image Resize step that resize the image to target area while maintaining the aspect ratio. + + Components: + image_resize_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + + Outputs: + resized_image (`List`): + The resized images + """ + model_name = "qwenimage-edit" - def __init__( - self, - input_name: str = "image", - output_name: str = "resized_image", - ): - """Create a configurable step for resizing images to the target area while maintaining the aspect ratio. - - Args: - input_name (str, optional): Name of the image field to read from the - pipeline state. Defaults to "image". - output_name (str, optional): Name of the resized image field to write - back to the pipeline state. Defaults to "resized_image". - """ - if not isinstance(input_name, str) or not isinstance(output_name, str): - raise ValueError( - f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" - ) - self._image_input_name = input_name - self._resized_image_output_name = output_name - super().__init__() - @property def description(self) -> str: - return f"Image Resize step that resize the {self._image_input_name} to target area while maintaining the aspect ratio." + return "Image Resize step that resize the image to target area while maintaining the aspect ratio." 
@property def expected_components(self) -> List[ComponentSpec]: @@ -300,17 +314,15 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: - return [ - InputParam( - name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" - ), - ] + return [InputParam.template("image")] @property def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" + name="resized_image", + type_hint=List[PIL.Image.Image], + description="The resized images", ), ] @@ -318,7 +330,7 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) - images = getattr(block_state, self._image_input_name) + images = block_state.image if not is_valid_image_imagelist(images): raise ValueError(f"Images must be image or list of images but are {type(images)}") @@ -334,38 +346,36 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): for image in images ] - setattr(block_state, self._resized_image_output_name, resized_images) + block_state.resized_image = resized_images self.set_block_state(state, block_state) return components, state +# auto_docstring class QwenImageLayeredResizeStep(ModularPipelineBlocks): + """ + Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while + maintaining the aspect ratio. + + Components: + image_resize_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + + Outputs: + resized_image (`List`): + The resized images + """ + model_name = "qwenimage-layered" - def __init__( - self, - input_name: str = "image", - output_name: str = "resized_image", - ): - """Create a configurable step for resizing images to the target area while maintaining the aspect ratio. - - Args: - input_name (str, optional): Name of the image field to read from the - pipeline state. Defaults to "image". - output_name (str, optional): Name of the resized image field to write - back to the pipeline state. Defaults to "resized_image". - """ - if not isinstance(input_name, str) or not isinstance(output_name, str): - raise ValueError( - f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" - ) - self._image_input_name = input_name - self._resized_image_output_name = output_name - super().__init__() - @property def description(self) -> str: - return f"Image Resize step that resize the {self._image_input_name} to target area while maintaining the aspect ratio." + return "Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio." 
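Parameter declarations in these blocks now come in two flavors: shared templates (`InputParam.template(...)`), optionally customized per block with keywords such as `required=` or `default=`, and plain `InputParam(...)` specs for fields that have no shared template (for example the `resolution` parameter of the layered resize step above). The sketch below shows how the two styles combine inside an `inputs` property; the override semantics are assumed from the call sites in this patch, and the particular mix of parameters is illustrative only.

# Hypothetical sketch (not part of this diff): combining shared templates with inline specs.
# The override keywords (`required`, `default`) mirror call sites in this patch; exact
# template semantics are assumed.
from typing import List

from diffusers.modular_pipelines.modular_pipeline_utils import InputParam


def example_inputs() -> List[InputParam]:
    return [
        # Pulled as-is from the shared template table.
        InputParam.template("image"),
        # Template with a per-block override: optional here even if required elsewhere.
        InputParam.template("prompt", required=False),
        # Template with a block-specific default.
        InputParam.template("max_sequence_length", default=1024),
        # Fully inline spec for a parameter that has no shared template.
        InputParam(
            name="resolution",
            default=640,
            type_hint=int,
            description="The target area to resize the image to, can be 1024 or 640",
        ),
    ]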
@property def expected_components(self) -> List[ComponentSpec]: @@ -381,9 +391,7 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam( - name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" - ), + InputParam.template("image"), InputParam( name="resolution", default=640, @@ -396,8 +404,10 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" - ), + name="resized_image", + type_hint=List[PIL.Image.Image], + description="The resized images", + ) ] @staticmethod @@ -411,7 +421,7 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): self.check_inputs(resolution=block_state.resolution) - images = getattr(block_state, self._image_input_name) + images = block_state.image if not is_valid_image_imagelist(images): raise ValueError(f"Images must be image or list of images but are {type(images)}") @@ -428,45 +438,40 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): for image in images ] - setattr(block_state, self._resized_image_output_name, resized_images) + block_state.resized_image = resized_images self.set_block_state(state, block_state) return components, state +# auto_docstring class QwenImageEditPlusResizeStep(ModularPipelineBlocks): - """Resize each image independently based on its own aspect ratio. For QwenImage Edit Plus.""" + """ + Resize images for QwenImage Edit Plus pipeline. + Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text + encoding. Each image is resized independently based on its own aspect ratio. + + Components: + image_resize_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + + Outputs: + resized_image (`List`): + Images resized to 1024x1024 target area for VAE encoding + resized_cond_image (`List`): + Images resized to 384x384 target area for VL text encoding + """ model_name = "qwenimage-edit-plus" - def __init__( - self, - input_name: str = "image", - output_name: str = "resized_image", - target_area: int = 1024 * 1024, - ): - """Create a step for resizing images to a target area. - - Each image is resized independently based on its own aspect ratio. This is suitable for Edit Plus where - multiple reference images can have different dimensions. - - Args: - input_name (str, optional): Name of the image field to read. Defaults to "image". - output_name (str, optional): Name of the resized image field to write. Defaults to "resized_image". - target_area (int, optional): Target area in pixels. Defaults to 1024*1024. 
- """ - if not isinstance(input_name, str) or not isinstance(output_name, str): - raise ValueError( - f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}" - ) - self._image_input_name = input_name - self._resized_image_output_name = output_name - self._target_area = target_area - super().__init__() - @property def description(self) -> str: return ( - f"Image Resize step that resizes {self._image_input_name} to target area {self._target_area}.\n" + "Resize images for QwenImage Edit Plus pipeline.\n" + "Produces two outputs: resized_image (1024x1024) for VAE encoding, " + "resized_cond_image (384x384) for VL text encoding.\n" "Each image is resized independently based on its own aspect ratio." ) @@ -483,20 +488,21 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: - return [ - InputParam( - name=self._image_input_name, - required=True, - type_hint=torch.Tensor, - description="The image(s) to resize", - ), - ] + # image + return [InputParam.template("image")] @property def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images" + name="resized_image", + type_hint=List[PIL.Image.Image], + description="Images resized to 1024x1024 target area for VAE encoding", + ), + OutputParam( + name="resized_cond_image", + type_hint=List[PIL.Image.Image], + description="Images resized to 384x384 target area for VL text encoding", ), ] @@ -504,7 +510,7 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks): def __call__(self, components: QwenImageModularPipeline, state: PipelineState): block_state = self.get_block_state(state) - images = getattr(block_state, self._image_input_name) + images = block_state.image if not is_valid_image_imagelist(images): raise ValueError(f"Images must be image or list of images but are {type(images)}") @@ -514,16 +520,22 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks): # Resize each image independently based on its own aspect ratio resized_images = [] + resized_cond_images = [] for image in images: image_width, image_height = image.size - calculated_width, calculated_height, _ = calculate_dimensions( - self._target_area, image_width / image_height - ) - resized_images.append( - components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width) + + # For VAE encoder (1024x1024 target area) + vae_width, vae_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height) + resized_images.append(components.image_resize_processor.resize(image, height=vae_height, width=vae_width)) + + # For VL text encoder (384x384 target area) + vl_width, vl_height, _ = calculate_dimensions(384 * 384, image_width / image_height) + resized_cond_images.append( + components.image_resize_processor.resize(image, height=vl_height, width=vl_width) ) - setattr(block_state, self._resized_image_output_name, resized_images) + block_state.resized_image = resized_images + block_state.resized_cond_image = resized_cond_images self.set_block_state(state, block_state) return components, state @@ -531,14 +543,38 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks): # ==================== # 2. GET IMAGE PROMPT # ==================== + + +# auto_docstring class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): """ - Auto-caption step that generates a text prompt from the input image if none is provided. 
Uses the VL model to - generate a description of the image. + Auto-caption step that generates a text prompt from the input image if none is provided. + Uses the VL model (text_encoder) to generate a description of the image. If prompt is already provided, this step + passes through unchanged. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) + + Inputs: + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + resized_image (`Image`): + The image to generate caption from, should be resized use the resize step + use_en_prompt (`bool`, *optional*, defaults to False): + Whether to use English prompt template + + Outputs: + prompt (`str`): + The prompt or prompts to guide image generation. If not provided, updated using image caption """ model_name = "qwenimage-layered" + def __init__(self): + self.image_caption_prompt_en = QWENIMAGE_LAYERED_CAPTION_PROMPT_EN + self.image_caption_prompt_cn = QWENIMAGE_LAYERED_CAPTION_PROMPT_CN + super().__init__() + @property def description(self) -> str: return ( @@ -554,17 +590,12 @@ class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): ComponentSpec("processor", Qwen2VLProcessor), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="image_caption_prompt_en", default=QWENIMAGE_LAYERED_CAPTION_PROMPT_EN), - ConfigSpec(name="image_caption_prompt_cn", default=QWENIMAGE_LAYERED_CAPTION_PROMPT_CN), - ] - @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", type_hint=str, description="The prompt to encode"), + InputParam.template( + "prompt", required=False + ), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -579,6 +610,16 @@ class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): ), ] + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="prompt", + type_hint=str, + description="The prompt or prompts to guide image generation. If not provided, updated using image caption", + ), + ] + @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) @@ -588,9 +629,9 @@ class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): # If prompt is empty or None, generate caption from image if block_state.prompt is None or block_state.prompt == "" or block_state.prompt == " ": if block_state.use_en_prompt: - caption_prompt = components.config.image_caption_prompt_en + caption_prompt = self.image_caption_prompt_en else: - caption_prompt = components.config.image_caption_prompt_cn + caption_prompt = self.image_caption_prompt_cn model_inputs = components.processor( text=caption_prompt, @@ -616,9 +657,44 @@ class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): # ==================== # 3. TEXT ENCODER # ==================== + + +# auto_docstring class QwenImageTextEncoderStep(ModularPipelineBlocks): + """ + Text Encoder step that generates text embeddings to guide the image generation. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): + The tokenizer to use guider (`ClassifierFreeGuidance`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. 
+ max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + Outputs: + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. + """ + model_name = "qwenimage" + def __init__(self): + self.prompt_template_encode = QWENIMAGE_PROMPT_TEMPLATE + self.prompt_template_encode_start_idx = QWENIMAGE_PROMPT_TEMPLATE_START_IDX + self.tokenizer_max_length = 1024 + super().__init__() + @property def description(self) -> str: return "Text Encoder step that generates text embeddings to guide the image generation." @@ -636,51 +712,21 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks): ), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_PROMPT_TEMPLATE), - ConfigSpec(name="prompt_template_encode_start_idx", default=QWENIMAGE_PROMPT_TEMPLATE_START_IDX), - ConfigSpec(name="tokenizer_max_length", default=1024), - ] - @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), - InputParam( - name="max_sequence_length", type_hint=int, description="The max sequence length to use", default=1024 - ), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), + InputParam.template("max_sequence_length", default=1024), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The prompt embeddings", - ), - OutputParam( - name="prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The encoder attention mask", - ), - OutputParam( - name="negative_prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings", - ), - OutputParam( - name="negative_prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings mask", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), ] @staticmethod @@ -709,9 +755,9 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks): components.text_encoder, components.tokenizer, prompt=block_state.prompt, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, - tokenizer_max_length=components.config.tokenizer_max_length, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, + tokenizer_max_length=self.tokenizer_max_length, device=device, ) @@ -726,9 +772,9 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks): components.text_encoder, components.tokenizer, prompt=negative_prompt, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, - tokenizer_max_length=components.config.tokenizer_max_length, + 
prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, + tokenizer_max_length=self.tokenizer_max_length, device=device, ) block_state.negative_prompt_embeds = block_state.negative_prompt_embeds[ @@ -742,9 +788,42 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageEditTextEncoderStep(ModularPipelineBlocks): + """ + Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image + generation. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider + (`ClassifierFreeGuidance`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + resized_image (`Image`): + The image prompt to encode, should be resized using resize step + + Outputs: + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. + """ + model_name = "qwenimage" + def __init__(self): + self.prompt_template_encode = QWENIMAGE_EDIT_PROMPT_TEMPLATE + self.prompt_template_encode_start_idx = QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX + super().__init__() + @property def description(self) -> str: return "Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation." @@ -762,18 +841,11 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks): ), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_EDIT_PROMPT_TEMPLATE), - ConfigSpec(name="prompt_template_encode_start_idx", default=QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX), - ] - @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), InputParam( name="resized_image", required=True, @@ -785,30 +857,10 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks): @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The prompt embeddings", - ), - OutputParam( - name="prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The encoder attention mask", - ), - OutputParam( - name="negative_prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings", - ), - OutputParam( - name="negative_prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings mask", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), ] @staticmethod @@ -836,8 +888,8 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks): components.processor, prompt=block_state.prompt, 
image=block_state.resized_image, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) @@ -850,8 +902,8 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks): components.processor, prompt=negative_prompt, image=block_state.resized_image, - prompt_template_encode=components.config.prompt_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) @@ -859,11 +911,44 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): - """Text encoder for QwenImage Edit Plus (VL encoding with multiple images).""" + """ + Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text + embeddings for guiding image generation. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider + (`ClassifierFreeGuidance`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + resized_cond_image (`Tensor`): + The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using + resize step + + Outputs: + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. 
+ """ model_name = "qwenimage-edit-plus" + def __init__(self): + self.prompt_template_encode = QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE + self.img_template_encode = QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE + self.prompt_template_encode_start_idx = QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDX + super().__init__() + @property def description(self) -> str: return ( @@ -884,19 +969,11 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): ), ] - @property - def expected_configs(self) -> List[ConfigSpec]: - return [ - ConfigSpec(name="prompt_template_encode", default=QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE), - ConfigSpec(name="img_template_encode", default=QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE), - ConfigSpec(name="prompt_template_encode_start_idx", default=QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDX), - ] - @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), + InputParam.template("prompt"), + InputParam.template("negative_prompt"), InputParam( name="resized_cond_image", required=True, @@ -908,30 +985,10 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The prompt embeddings", - ), - OutputParam( - name="prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The encoder attention mask", - ), - OutputParam( - name="negative_prompt_embeds", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings", - ), - OutputParam( - name="negative_prompt_embeds_mask", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="The negative prompt embeddings mask", - ), + OutputParam.template("prompt_embeds"), + OutputParam.template("prompt_embeds_mask"), + OutputParam.template("negative_prompt_embeds"), + OutputParam.template("negative_prompt_embeds_mask"), ] @staticmethod @@ -959,9 +1016,9 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): components.processor, prompt=block_state.prompt, image=block_state.resized_cond_image, - prompt_template_encode=components.config.prompt_template_encode, - img_template_encode=components.config.img_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + img_template_encode=self.img_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) @@ -975,9 +1032,9 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): components.processor, prompt=negative_prompt, image=block_state.resized_cond_image, - prompt_template_encode=components.config.prompt_template_encode, - img_template_encode=components.config.img_template_encode, - prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx, + prompt_template_encode=self.prompt_template_encode, + img_template_encode=self.img_template_encode, + prompt_template_encode_start_idx=self.prompt_template_encode_start_idx, device=device, ) ) @@ -989,7 +1046,38 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): # ==================== # 4. 
IMAGE PREPROCESS # ==================== + + +# auto_docstring class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be + resized to the given height and width. + + Components: + image_mask_processor (`InpaintProcessor`) + + Inputs: + mask_image (`Image`): + Mask image for inpainting. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + Outputs: + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + """ + model_name = "qwenimage" @property @@ -1010,18 +1098,26 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("mask_image", required=True), - InputParam("image", required=True), - InputParam("height"), - InputParam("width"), - InputParam("padding_mask_crop"), + InputParam.template("mask_image"), + InputParam.template("image"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("padding_mask_crop"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="processed_image"), - OutputParam(name="processed_mask_image"), + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ), + OutputParam( + name="processed_mask_image", + type_hint=torch.Tensor, + description="The processed mask image", + ), OutputParam( name="mask_overlay_kwargs", type_hint=Dict, @@ -1061,7 +1157,32 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be + resized first. + + Components: + image_mask_processor (`InpaintProcessor`) + + Inputs: + mask_image (`Image`): + Mask image for inpainting. + resized_image (`Image`): + The resized image. should be generated using a resize step + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + Outputs: + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + """ + model_name = "qwenimage-edit" @property @@ -1082,16 +1203,25 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("mask_image", required=True), - InputParam("resized_image", required=True), - InputParam("padding_mask_crop"), + InputParam.template("mask_image"), + InputParam( + name="resized_image", + required=True, + type_hint=PIL.Image.Image, + description="The resized image. 
should be generated using a resize step", + ), + InputParam.template("padding_mask_crop"), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="processed_image"), - OutputParam(name="processed_mask_image"), + OutputParam(name="processed_image", type_hint=torch.Tensor, description="The processed image"), + OutputParam( + name="processed_mask_image", + type_hint=torch.Tensor, + description="The processed mask image", + ), OutputParam( name="mask_overlay_kwargs", type_hint=Dict, @@ -1119,7 +1249,27 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step. will resize the image to the given height and width. + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + + Outputs: + processed_image (`Tensor`): + The processed image + """ + model_name = "qwenimage" @property @@ -1140,14 +1290,20 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("image", required=True), - InputParam("height"), - InputParam("width"), + InputParam.template("image"), + InputParam.template("height"), + InputParam.template("width"), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam(name="processed_image")] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @staticmethod def check_inputs(height, width, vae_scale_factor): @@ -1177,7 +1333,23 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step. Images needs to be resized first. + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + resized_image (`List`): + The resized image. should be generated using a resize step + + Outputs: + processed_image (`Tensor`): + The processed image + """ + model_name = "qwenimage-edit" @property @@ -1198,12 +1370,23 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("resized_image", required=True), + InputParam( + name="resized_image", + required=True, + type_hint=List[PIL.Image.Image], + description="The resized image. should be generated using a resize step", + ), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam(name="processed_image")] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1221,12 +1404,29 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): + """ + Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of + processed images. + + Components: + image_processor (`VaeImageProcessor`) + + Inputs: + resized_image (`List`): + The resized image. 
should be generated using a resize step + + Outputs: + processed_image (`Tensor`): + The processed image + """ + model_name = "qwenimage-edit-plus" @property def description(self) -> str: - return "Image Preprocess step. Images can be resized first using QwenImageEditResizeStep." + return "Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of processed images." @property def expected_components(self) -> List[ComponentSpec]: @@ -1241,11 +1441,24 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: - return [InputParam("resized_image")] + return [ + InputParam( + name="resized_image", + required=True, + type_hint=List[PIL.Image.Image], + description="The resized image. should be generated using a resize step", + ) + ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam(name="processed_image")] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1263,7 +1476,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): processed_images.append( components.image_processor.preprocess(image=img, height=img_height, width=img_width) ) - block_state.processed_image = processed_images + if is_image_list: block_state.processed_image = processed_images else: @@ -1276,15 +1489,34 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): # ==================== # 5. VAE ENCODER # ==================== + + +# auto_docstring class QwenImageVaeEncoderStep(ModularPipelineBlocks): - """VAE encoder that handles both single images and lists of images with varied resolutions.""" + """ + VAE Encoder step that converts processed_image into latent representations image_latents. + Handles both single images and lists of images with varied resolutions. + + Components: + vae (`AutoencoderKLQwenImage`) + + Inputs: + processed_image (`Tensor`): + The image tensor to encode + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + image_latents (`Tensor`): + The latent representation of the input image. + """ model_name = "qwenimage" def __init__( self, - input_name: str = "processed_image", - output_name: str = "image_latents", + input: Optional[InputParam] = None, + output: Optional[OutputParam] = None, ): """Initialize a VAE encoder step for converting images to latent representations. @@ -1292,11 +1524,26 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks): a single tensor, outputs a single latent tensor. Args: - input_name (str, optional): Name of the input image tensor or list. Defaults to "processed_image". - output_name (str, optional): Name of the output latent tensor or list. Defaults to "image_latents". + input (InputParam, optional): Input parameter for the processed image. Defaults to "processed_image". + output (OutputParam, optional): Output parameter for the image latents. Defaults to "image_latents". 
""" - self._image_input_name = input_name - self._image_latents_output_name = output_name + if input is None: + input = InputParam( + name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode" + ) + + if output is None: + output = OutputParam.template("image_latents") + + if not isinstance(input, InputParam): + raise ValueError(f"input must be InputParam but is {type(input)}") + if not isinstance(output, OutputParam): + raise ValueError(f"output must be OutputParam but is {type(output)}") + + self._input = input + self._output = output + self._image_input_name = input.name + self._image_latents_output_name = output.name super().__init__() @property @@ -1312,17 +1559,14 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: - return [InputParam(self._image_input_name, required=True), InputParam("generator")] + return [ + self._input, # default is "processed_image" + InputParam.template("generator"), + ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [ - OutputParam( - self._image_latents_output_name, - type_hint=torch.Tensor, - description="The latents representing the reference image(s). Single tensor or list depending on input.", - ) - ] + return [self._output] # default is "image_latents" @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -1359,7 +1603,30 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): + """ + VAE Encoder step that converts `control_image` into latent representations control_image_latents. + + Components: + vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor + (`VaeImageProcessor`) + + Inputs: + control_image (`Image`): + Control image for ControlNet conditioning. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + control_image_latents (`Tensor`): + The latents representing the control image + """ + model_name = "qwenimage" @property @@ -1383,10 +1650,10 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam("control_image", required=True), - InputParam("height"), - InputParam("width"), - InputParam("generator"), + InputParam.template("control_image"), + InputParam.template("height"), + InputParam.template("width"), + InputParam.template("generator"), ] return inputs @@ -1473,23 +1740,38 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): # ==================== # 6. PERMUTE LATENTS # ==================== + + +# auto_docstring class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): - """Permute image latents from VAE format to Layered format.""" + """ + Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing. + + Inputs: + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_latents (`Tensor`): + The latent representation of the input image. 
(permuted from [B, C, 1, H, W] to [B, 1, C, H, W]) + """ model_name = "qwenimage-layered" - def __init__(self, input_name: str = "image_latents"): - self._input_name = input_name - super().__init__() - @property def description(self) -> str: - return f"Permute {self._input_name} from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." + return "Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." @property def inputs(self) -> List[InputParam]: return [ - InputParam(self._input_name, required=True), + InputParam.template("image_latents"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam.template("image_latents", note="permuted from [B, C, 1, H, W] to [B, 1, C, H, W]"), ] @torch.no_grad() @@ -1497,8 +1779,8 @@ class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): block_state = self.get_block_state(state) # Permute: (B, C, 1, H, W) -> (B, 1, C, H, W) - latents = getattr(block_state, self._input_name) - setattr(block_state, self._input_name, latents.permute(0, 2, 1, 3, 4)) + latents = block_state.image_latents + block_state.image_latents = latents.permute(0, 2, 1, 3, 4) self.set_block_state(state, block_state) return components, state diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 4a1cf3700c..818bbca5ed 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple +from typing import List, Optional, Tuple import torch @@ -109,7 +109,44 @@ def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor: in return height, width +# auto_docstring class QwenImageTextInputsStep(ModularPipelineBlocks): + """ + Text input processing step that standardizes text embeddings for the pipeline. + This step: + 1. Determines `batch_size` and `dtype` based on `prompt_embeds` + 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt) + + This block should be placed after all encoder steps to process the text embeddings before they are used in + subsequent pipeline steps. + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + + Outputs: + batch_size (`int`): + The batch size of the prompt embeddings + dtype (`dtype`): + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. 
(batch-expanded) + """ + model_name = "qwenimage" @property @@ -129,26 +166,22 @@ class QwenImageTextInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"), - InputParam(name="prompt_embeds_mask", required=True, kwargs_type="denoiser_input_fields"), - InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"), - InputParam(name="negative_prompt_embeds_mask", kwargs_type="denoiser_input_fields"), + InputParam.template("num_images_per_prompt"), + InputParam.template("prompt_embeds"), + InputParam.template("prompt_embeds_mask"), + InputParam.template("negative_prompt_embeds"), + InputParam.template("negative_prompt_embeds_mask"), ] @property - def intermediate_outputs(self) -> List[str]: + def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - "batch_size", - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt", - ), - OutputParam( - "dtype", - type_hint=torch.dtype, - description="Data type of model tensor inputs (determined by `prompt_embeds`)", - ), + OutputParam(name="batch_size", type_hint=int, description="The batch size of the prompt embeddings"), + OutputParam(name="dtype", type_hint=torch.dtype, description="The data type of the prompt embeddings"), + OutputParam.template("prompt_embeds", note="batch-expanded"), + OutputParam.template("prompt_embeds_mask", note="batch-expanded"), + OutputParam.template("negative_prompt_embeds", note="batch-expanded"), + OutputParam.template("negative_prompt_embeds_mask", note="batch-expanded"), ] @staticmethod @@ -221,20 +254,76 @@ class QwenImageTextInputsStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageAdditionalInputsStep(ModularPipelineBlocks): - """Input step for QwenImage: update height/width, expand batch, patchify.""" + """ + Input processing step that: + 1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size + 2. For additional batch inputs: Expands batch dimensions to match final batch size + + Configured inputs: + - Image latent inputs: ['image_latents'] + + This block should be placed after the encoder steps and the text input step. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_height (`int`): + The image height calculated from the image latents dimension + image_width (`int`): + The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
(patchified and + batch-expanded) + """ model_name = "qwenimage" def __init__( self, - image_latent_inputs: List[str] = ["image_latents"], - additional_batch_inputs: List[str] = [], + image_latent_inputs: Optional[List[InputParam]] = None, + additional_batch_inputs: Optional[List[InputParam]] = None, ): + # by default, process `image_latents` + if image_latent_inputs is None: + image_latent_inputs = [InputParam.template("image_latents")] + if additional_batch_inputs is None: + additional_batch_inputs = [] + if not isinstance(image_latent_inputs, list): - image_latent_inputs = [image_latent_inputs] + raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}") + else: + for input_param in image_latent_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}") + if not isinstance(additional_batch_inputs, list): - additional_batch_inputs = [additional_batch_inputs] + raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}") + else: + for input_param in additional_batch_inputs: + if not isinstance(input_param, InputParam): + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -252,9 +341,9 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): if self._image_latent_inputs or self._additional_batch_inputs: inputs_info = "\n\nConfigured inputs:" if self._image_latent_inputs: - inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}" if self._additional_batch_inputs: - inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}" placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." 
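As a point of reference, here is a minimal usage sketch of the refactored constructor. It is illustrative only: it assumes the module paths used in this PR and that `InputParam.template(...)` resolves the predefined template for the given name, as shown above.

```python
# Illustrative sketch: configuring QwenImageAdditionalInputsStep with InputParam
# objects instead of plain string names. Import paths follow this PR's file layout.
import torch

from diffusers.modular_pipelines.modular_pipeline_utils import InputParam
from diffusers.modular_pipelines.qwenimage.inputs import QwenImageAdditionalInputsStep

step = QwenImageAdditionalInputsStep(
    # the default: resolve the predefined "image_latents" template
    image_latent_inputs=[InputParam.template("image_latents")],
    # extra tensors that only need batch expansion are passed as full InputParam objects
    additional_batch_inputs=[
        InputParam(
            name="processed_mask_image",
            type_hint=torch.Tensor,
            description="The processed mask image",
        )
    ],
)

print(step.description)                       # lists the configured input names
print([p.name for p in step.inputs])          # ..., image_latents, processed_mask_image
print([o.name for o in step.intermediate_outputs])
```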
@@ -269,23 +358,19 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="batch_size", required=True), - InputParam(name="height"), - InputParam(name="width"), + InputParam.template("num_images_per_prompt"), + InputParam.template("batch_size"), + InputParam.template("height"), + InputParam.template("width"), ] - - for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam(name=image_latent_input_name)) - - for input_name in self._additional_batch_inputs: - inputs.append(InputParam(name=input_name)) + # default is `image_latents` + inputs += self._image_latent_inputs + self._additional_batch_inputs return inputs @property def intermediate_outputs(self) -> List[OutputParam]: - return [ + outputs = [ OutputParam( name="image_height", type_hint=int, @@ -298,11 +383,43 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): ), ] + # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided + if len(self._image_latent_inputs) > 0: + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) + + # image latent inputs are modified in place (patchified and batch-expanded) + for input_param in self._image_latent_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (patchified and batch-expanded)", + ) + ) + + # additional batch inputs (batch-expanded only) + for input_param in self._additional_batch_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (batch-expanded)", + ) + ) + + return outputs + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Process image latent inputs - for image_latent_input_name in self._image_latent_inputs: + for input_param in self._image_latent_inputs: + image_latent_input_name = input_param.name image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue @@ -331,7 +448,8 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): setattr(block_state, image_latent_input_name, image_latent_tensor) # Process additional batch inputs (only batch expansion) - for input_name in self._additional_batch_inputs: + for input_param in self._additional_batch_inputs: + input_name = input_param.name input_tensor = getattr(block_state, input_name) if input_tensor is None: continue @@ -349,20 +467,76 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): - """Input step for QwenImage Edit Plus: handles list of latents with different sizes.""" + """ + Input processing step for Edit Plus that: + 1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch + 2. For additional batch inputs: Expands batch dimensions to match final batch size + Height/width defaults to last image in the list. 
+ + Configured inputs: + - Image latent inputs: ['image_latents'] + + This block should be placed after the encoder steps and the text input step. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_height (`List`): + The image heights calculated from the image latents dimension + image_width (`List`): + The image widths calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified, + concatenated, and batch-expanded) + """ model_name = "qwenimage-edit-plus" def __init__( self, - image_latent_inputs: List[str] = ["image_latents"], - additional_batch_inputs: List[str] = [], + image_latent_inputs: Optional[List[InputParam]] = None, + additional_batch_inputs: Optional[List[InputParam]] = None, ): + if image_latent_inputs is None: + image_latent_inputs = [InputParam.template("image_latents")] + if additional_batch_inputs is None: + additional_batch_inputs = [] + if not isinstance(image_latent_inputs, list): - image_latent_inputs = [image_latent_inputs] + raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}") + else: + for input_param in image_latent_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}") + if not isinstance(additional_batch_inputs, list): - additional_batch_inputs = [additional_batch_inputs] + raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}") + else: + for input_param in additional_batch_inputs: + if not isinstance(input_param, InputParam): + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -381,9 +555,9 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): if self._image_latent_inputs or self._additional_batch_inputs: inputs_info = "\n\nConfigured inputs:" if self._image_latent_inputs: - inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}" if self._additional_batch_inputs: - inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}" placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." 
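To make the stricter contract concrete, here is a small hedged sketch of the validation added in this constructor (hypothetical usage; the error message is the one raised above):

```python
# Illustrative sketch: plain strings are no longer accepted for image_latent_inputs;
# entries must be InputParam objects (for example created via InputParam.template).
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam
from diffusers.modular_pipelines.qwenimage.inputs import QwenImageEditPlusAdditionalInputsStep

try:
    # previously valid, now rejected because the list entries are str, not InputParam
    QwenImageEditPlusAdditionalInputsStep(image_latent_inputs=["image_latents"])
except ValueError as err:
    print(err)  # image_latent_inputs must be a list of InputParam, but got <class 'str'>

# supported form
step = QwenImageEditPlusAdditionalInputsStep(
    image_latent_inputs=[InputParam.template("image_latents")],
)
print([p.name for p in step.inputs])  # num_images_per_prompt, batch_size, height, width, image_latents
```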
@@ -398,23 +572,20 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="batch_size", required=True), - InputParam(name="height"), - InputParam(name="width"), + InputParam.template("num_images_per_prompt"), + InputParam.template("batch_size"), + InputParam.template("height"), + InputParam.template("width"), ] - for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam(name=image_latent_input_name)) - - for input_name in self._additional_batch_inputs: - inputs.append(InputParam(name=input_name)) + # default is `image_latents` + inputs += self._image_latent_inputs + self._additional_batch_inputs return inputs @property def intermediate_outputs(self) -> List[OutputParam]: - return [ + outputs = [ OutputParam( name="image_height", type_hint=List[int], @@ -427,11 +598,43 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): ), ] + # `height`/`width` are updated if any image latent inputs are provided + if len(self._image_latent_inputs) > 0: + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) + + # image latent inputs are modified in place (patchified, concatenated, and batch-expanded) + for input_param in self._image_latent_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (patchified, concatenated, and batch-expanded)", + ) + ) + + # additional batch inputs (batch-expanded only) + for input_param in self._additional_batch_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (batch-expanded)", + ) + ) + + return outputs + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Process image latent inputs - for image_latent_input_name in self._image_latent_inputs: + for input_param in self._image_latent_inputs: + image_latent_input_name = input_param.name image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue @@ -476,7 +679,8 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): setattr(block_state, image_latent_input_name, packed_image_latent_tensors) # Process additional batch inputs (only batch expansion) - for input_name in self._additional_batch_inputs: + for input_param in self._additional_batch_inputs: + input_name = input_param.name input_tensor = getattr(block_state, input_name) if input_tensor is None: continue @@ -494,22 +698,75 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): return components, state -# YiYi TODO: support define config default component from the ModularPipeline level. -# it is same as QwenImageAdditionalInputsStep, but with layered pachifier. +# same as QwenImageAdditionalInputsStep, but with layered pachifier. + + +# auto_docstring class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): - """Input step for QwenImage Layered: update height/width, expand batch, patchify with layered pachifier.""" + """ + Input processing step for Layered that: + 1. 
For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch + size + 2. For additional batch inputs: Expands batch dimensions to match final batch size + + Configured inputs: + - Image latent inputs: ['image_latents'] + + This block should be placed after the encoder steps and the text input step. + + Components: + pachifier (`QwenImageLayeredPachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + image_height (`int`): + The image height calculated from the image latents dimension + image_width (`int`): + The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified + with layered pachifier and batch-expanded) + """ model_name = "qwenimage-layered" def __init__( self, - image_latent_inputs: List[str] = ["image_latents"], - additional_batch_inputs: List[str] = [], + image_latent_inputs: Optional[List[InputParam]] = None, + additional_batch_inputs: Optional[List[InputParam]] = None, ): + if image_latent_inputs is None: + image_latent_inputs = [InputParam.template("image_latents")] + if additional_batch_inputs is None: + additional_batch_inputs = [] + if not isinstance(image_latent_inputs, list): - image_latent_inputs = [image_latent_inputs] + raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}") + else: + for input_param in image_latent_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}") + if not isinstance(additional_batch_inputs, list): - additional_batch_inputs = [additional_batch_inputs] + raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}") + else: + for input_param in additional_batch_inputs: + if not isinstance(input_param, InputParam): + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -527,9 +784,9 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): if self._image_latent_inputs or self._additional_batch_inputs: inputs_info = "\n\nConfigured inputs:" if self._image_latent_inputs: - inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}" + inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}" if self._additional_batch_inputs: - inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}" + inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}" placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." 
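As a quick illustration of how the dynamically built outputs read for the layered variant, here is a sketch assuming the default configuration of this block; the descriptions come from the `image_latents` template plus the notes appended below.

```python
# Illustrative sketch: intermediate outputs are derived from the configured
# InputParam objects, with a note describing how each tensor is transformed.
from diffusers.modular_pipelines.qwenimage.inputs import QwenImageLayeredAdditionalInputsStep

step = QwenImageLayeredAdditionalInputsStep()  # defaults to image_latent_inputs=[InputParam.template("image_latents")]
for out in step.intermediate_outputs:
    print(f"{out.name}: {out.description}")
# image_height: The image height calculated from the image latents dimension
# image_width: The image width calculated from the image latents dimension
# height: if not provided, updated to image height
# width: if not provided, updated to image width
# image_latents: image latents used to guide the image generation. ... (patchified with layered pachifier and batch-expanded)
```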
@@ -544,21 +801,18 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="batch_size", required=True), + InputParam.template("num_images_per_prompt"), + InputParam.template("batch_size"), ] + # default is `image_latents` - for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam(name=image_latent_input_name)) - - for input_name in self._additional_batch_inputs: - inputs.append(InputParam(name=input_name)) + inputs += self._image_latent_inputs + self._additional_batch_inputs return inputs @property def intermediate_outputs(self) -> List[OutputParam]: - return [ + outputs = [ OutputParam( name="image_height", type_hint=int, @@ -569,15 +823,44 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): type_hint=int, description="The image width calculated from the image latents dimension", ), - OutputParam(name="height", type_hint=int, description="The height of the image output"), - OutputParam(name="width", type_hint=int, description="The width of the image output"), ] + if len(self._image_latent_inputs) > 0: + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) + + # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded) + for input_param in self._image_latent_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (patchified with layered pachifier and batch-expanded)", + ) + ) + + # Add outputs for additional batch inputs (batch-expanded only) + for input_param in self._additional_batch_inputs: + outputs.append( + OutputParam( + name=input_param.name, + type_hint=input_param.type_hint, + description=input_param.description + " (batch-expanded)", + ) + ) + + return outputs + def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) # Process image latent inputs - for image_latent_input_name in self._image_latent_inputs: + for input_param in self._image_latent_inputs: + image_latent_input_name = input_param.name image_latent_tensor = getattr(block_state, image_latent_input_name) if image_latent_tensor is None: continue @@ -608,7 +891,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): setattr(block_state, image_latent_input_name, image_latent_tensor) # Process additional batch inputs (only batch expansion) - for input_name in self._additional_batch_inputs: + for input_param in self._additional_batch_inputs: + input_name = input_param.name input_tensor = getattr(block_state, input_name) if input_tensor is None: continue @@ -626,7 +910,34 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): return components, state +# auto_docstring class QwenImageControlNetInputsStep(ModularPipelineBlocks): + """ + prepare the `control_image_latents` for controlnet. Insert after all the other inputs steps. + + Inputs: + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. 
+ batch_size (`int`, *optional*, defaults to 1): + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + + Outputs: + control_image_latents (`Tensor`): + The control image latents (patchified and batch-expanded). + height (`int`): + if not provided, updated to control image height + width (`int`): + if not provided, updated to control image width + """ + model_name = "qwenimage" @property @@ -636,11 +947,28 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="control_image_latents", required=True), - InputParam(name="batch_size", required=True), - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="height"), - InputParam(name="width"), + InputParam( + name="control_image_latents", + required=True, + type_hint=torch.Tensor, + description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.", + ), + InputParam.template("batch_size"), + InputParam.template("num_images_per_prompt"), + InputParam.template("height"), + InputParam.template("width"), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="control_image_latents", + type_hint=torch.Tensor, + description="The control image latents (patchified and batch-expanded).", + ), + OutputParam(name="height", type_hint=int, description="if not provided, updated to control image height"), + OutputParam(name="width", type_hint=int, description="if not provided, updated to control image width"), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index ebe0bbbd75..5837799d34 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -12,14 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam +from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam from .before_denoise import ( QwenImageControlNetBeforeDenoiserStep, QwenImageCreateMaskLatentsStep, @@ -59,11 +56,91 @@ logger = logging.get_logger(__name__) # ==================== -# 1. VAE ENCODER +# 1. TEXT ENCODER # ==================== +# auto_docstring +class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): + """ + Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): + The tokenizer to use guider (`ClassifierFreeGuidance`) + + Inputs: + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. 
+ max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + Outputs: + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. + """ + + model_name = "qwenimage" + block_classes = [QwenImageTextEncoderStep()] + block_names = ["text_encoder"] + block_trigger_inputs = ["prompt"] + + @property + def description(self) -> str: + return "Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block." + " - `QwenImageTextEncoderStep` (text_encoder) is used when `prompt` is provided." + " - if `prompt` is not provided, step will be skipped." + + +# ==================== +# 2. VAE ENCODER +# ==================== + + +# auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): + """ + This step is used for processing image and mask inputs for inpainting tasks. It: + - Resizes the image to the target size, based on `height` and `width`. + - Processes and updates `image` and `mask_image`. + - Creates `image_latents`. + + Components: + image_mask_processor (`InpaintProcessor`) vae (`AutoencoderKLQwenImage`) + + Inputs: + mask_image (`Image`): + Mask image for inpainting. + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + image_latents (`Tensor`): + The latent representation of the input image. + """ + model_name = "qwenimage" block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()] block_names = ["preprocess", "encode"] @@ -78,7 +155,31 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): ) +# auto_docstring class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): + """ + Vae encoder step that preprocess andencode the image inputs into their latent representations. + + Components: + image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + processed_image (`Tensor`): + The processed image + image_latents (`Tensor`): + The latent representation of the input image. + """ + model_name = "qwenimage" block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()] @@ -89,7 +190,6 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): return "Vae encoder step that preprocess andencode the image inputs into their latent representations." 
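For orientation, the `# auto_docstring` marker denotes classes whose docstring is generated from their declared components, inputs, and outputs; the generated text is written into the class body as a regular docstring, so it can be inspected at runtime. A rough sketch (class and attribute names as in this file; treat it as an example, not the canonical API):

```python
# Illustrative sketch: inspecting an auto-documented block.
from diffusers.modular_pipelines.qwenimage.modular_blocks_qwenimage import (
    QwenImageImg2ImgVaeEncoderStep,
)

print(QwenImageImg2ImgVaeEncoderStep.block_names)  # ["preprocess", "encode"]
print(QwenImageImg2ImgVaeEncoderStep.__doc__)      # generated Components/Inputs/Outputs sections

block = QwenImageImg2ImgVaeEncoderStep()
print(block.description)  # short human-written summary that heads the generated docstring
```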
-# Auto VAE encoder class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep] block_names = ["inpaint", "img2img"] @@ -107,7 +207,33 @@ class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): # optional controlnet vae encoder +# auto_docstring class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): + """ + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block. + - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. + - if `control_image` is not provided, step will be skipped. + + Components: + vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor + (`VaeImageProcessor`) + + Inputs: + control_image (`Image`, *optional*): + Control image for ControlNet conditioning. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + control_image_latents (`Tensor`): + The latents representing the control image + """ + block_classes = [QwenImageControlNetVaeEncoderStep] block_names = ["controlnet"] block_trigger_inputs = ["control_image"] @@ -123,14 +249,65 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): # ==================== -# 2. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) +# 3. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) # ==================== # assemble input steps +# auto_docstring class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): + """ + Input step that prepares the inputs for the img2img denoising step. It: + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + batch_size (`int`): + The batch size of the prompt embeddings + dtype (`dtype`): + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. 
(batch-expanded) + image_height (`int`): + The image height calculated from the image latents dimension + image_width (`int`): + The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) + """ + model_name = "qwenimage" - block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])] + block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep()] block_names = ["text_inputs", "additional_inputs"] @property @@ -140,12 +317,69 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): " - update height/width based `image_latents`, patchify `image_latents`." +# auto_docstring class QwenImageInpaintInputStep(SequentialPipelineBlocks): + """ + Input step that prepares the inputs for the inpainting denoising step. It: + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image + + Outputs: + batch_size (`int`): + The batch size of the prompt embeddings + dtype (`dtype`): + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) + image_height (`int`): + The image height calculated from the image latents dimension + image_width (`int`): + The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
(patchified and + batch-expanded) + processed_mask_image (`Tensor`): + The processed mask image (batch-expanded) + """ + model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), QwenImageAdditionalInputsStep( - image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] + additional_batch_inputs=[ + InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image") + ] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -158,7 +392,42 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): # assemble prepare latents steps +# auto_docstring class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): + """ + This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It: + - Add noise to the image latents to create the latents input for the denoiser. + - Create the pachified latents `mask` based on the processedmask image. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`) + + Inputs: + latents (`Tensor`): + The initial random noised, can be generated in prepare latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and updated in input step.) + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + processed_mask_image (`Tensor`): + The processed mask to use for the inpainting process. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. + + Outputs: + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + latents (`Tensor`): + The scaled noisy latents to use for inpainting/image-to-image denoising. + mask (`Tensor`): + The mask to use for the inpainting process. + """ + model_name = "qwenimage" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -176,7 +445,49 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): # Qwen Image (text2image) +# auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): + """ + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). + + Components: + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. 
+ height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -199,9 +510,63 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + # Qwen Image (inpainting) +# auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. + + Components: + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageInpaintInputStep(), @@ -226,9 +591,61 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) 
for the denoise step for inpaint task." + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + # Qwen Image (image2image) +# auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. + + Components: + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageImg2ImgInputStep(), @@ -253,9 +670,66 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + # Qwen Image (text2image) with controlnet +# auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): + """ + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). + + Components: + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet + (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. 
Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -282,9 +756,72 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + # Qwen Image (inpainting) with controlnet +# auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. + + Components: + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet + (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. 
+ latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageInpaintInputStep(), @@ -313,9 +850,70 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task." + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + # Qwen Image (image2image) with controlnet +# auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): + """ + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. + + Components: + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet + (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + control_image_latents (`Tensor`): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. 
+ control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage" block_classes = [ QwenImageImg2ImgInputStep(), @@ -344,6 +942,12 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + # Auto denoise step for QwenImage class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): @@ -402,19 +1006,36 @@ class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): @property def outputs(self): return [ - OutputParam( - name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.template("latents"), ] # ==================== -# 3. DECODE +# 4. DECODE # ==================== # standard decode step works for most tasks except for inpaint +# auto_docstring class QwenImageDecodeStep(SequentialPipelineBlocks): + """ + Decode step that decodes the latents to images and postprocess the generated image. + + Components: + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`List`): + Generated images. (tensor output of the vae decoder.) + """ + model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -425,7 +1046,30 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): # Inpaint decode step +# auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): + """ + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. + + Components: + vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. + + Outputs: + images (`List`): + Generated images. (tensor output of the vae decoder.) + """ + model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -452,11 +1096,11 @@ class QwenImageAutoDecodeStep(AutoPipelineBlocks): # ==================== -# 4. AUTO BLOCKS & PRESETS +# 5. 
AUTO BLOCKS & PRESETS # ==================== AUTO_BLOCKS = InsertableDict( [ - ("text_encoder", QwenImageTextEncoderStep()), + ("text_encoder", QwenImageAutoTextEncoderStep()), ("vae_encoder", QwenImageAutoVaeEncoderStep()), ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()), ("denoise", QwenImageAutoCoreDenoiseStep()), @@ -465,7 +1109,89 @@ AUTO_BLOCKS = InsertableDict( ) +# auto_docstring class QwenImageAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. + - for image-to-image generation, you need to provide `image` + - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`. + - to run the controlnet workflow, you need to provide `control_image` + - for text-to-image generation, all you need to provide is `prompt` + + Components: + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): + The tokenizer to use guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae + (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) controlnet (`QwenImageControlNetModel`) + control_image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) + + Inputs: + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + mask_image (`Image`, *optional*): + Mask image for inpainting. + image (`Union[Image, List]`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + control_image (`Image`, *optional*): + Control image for ControlNet conditioning. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + latents (`Tensor`): + Pre-generated noisy latents for image generation. + num_inference_steps (`int`): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
+ processed_mask_image (`Tensor`, *optional*): + The processed mask image + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + control_image_latents (`Tensor`, *optional*): + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. + control_guidance_start (`float`, *optional*, defaults to 0.0): + When to start applying ControlNet. + control_guidance_end (`float`, *optional*, defaults to 1.0): + When to stop applying ControlNet. + controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): + Scale for ControlNet conditioning. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. + + Outputs: + images (`List`): + Generated images. + """ + model_name = "qwenimage" block_classes = AUTO_BLOCKS.values() @@ -476,7 +1202,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): return ( "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n" + "- for image-to-image generation, you need to provide `image`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" + + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.\n" + "- to run the controlnet workflow, you need to provide `control_image`\n" + "- for text-to-image generation, all you need to provide is `prompt`" ) @@ -484,5 +1210,5 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]), + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 2683e64080..e1e5c43354 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional +from typing import Optional -import PIL.Image import torch from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam +from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam from .before_denoise import ( QwenImageCreateMaskLatentsStep, QwenImageEditRoPEInputsStep, @@ -59,8 +58,35 @@ logger = logging.get_logger(__name__) # ==================== +# auto_docstring class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): - """VL encoder that takes both image and text prompts.""" + """ + QwenImage-Edit VL encoder step that encode the image and text prompts together. + + Components: + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + prompt (`str`): + The prompt or prompts to guide image generation. 
+ negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + Outputs: + resized_image (`List`): + The resized images + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. + """ model_name = "qwenimage-edit" block_classes = [ @@ -80,7 +106,30 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): # Edit VAE encoder +# auto_docstring class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): + """ + Vae encoder step that encode the image inputs into their latent representations. + + Components: + image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + resized_image (`List`): + The resized images + processed_image (`Tensor`): + The processed image + image_latents (`Tensor`): + The latent representation of the input image. + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -95,12 +144,46 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): # Edit Inpaint VAE encoder +# auto_docstring class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): + """ + This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: + - resize the image for target area (1024 * 1024) while maintaining the aspect ratio. + - process the resized image and mask image. + - create image latents. + + Components: + image_resize_processor (`VaeImageProcessor`) image_mask_processor (`InpaintProcessor`) vae + (`AutoencoderKLQwenImage`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + mask_image (`Image`): + Mask image for inpainting. + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + resized_image (`List`): + The resized images + processed_image (`Tensor`): + The processed image + processed_mask_image (`Tensor`): + The processed mask image + mask_overlay_kwargs (`Dict`): + The kwargs for the postprocess step to apply the mask overlay + image_latents (`Tensor`): + The latent representation of the input image. + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), QwenImageEditInpaintProcessImagesInputStep(), - QwenImageVaeEncoderStep(input_name="processed_image", output_name="image_latents"), + QwenImageVaeEncoderStep(), ] block_names = ["resize", "preprocess", "encode"] @@ -137,11 +220,64 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks): # assemble input steps +# auto_docstring class QwenImageEditInputStep(SequentialPipelineBlocks): + """ + Input step that prepares the inputs for the edit denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs. + - update height/width based `image_latents`, patchify `image_latents`. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + batch_size (`int`): + The batch size of the prompt embeddings + dtype (`dtype`): + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) + image_height (`int`): + The image height calculated from the image latents dimension + image_width (`int`): + The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), - QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"]), + QwenImageAdditionalInputsStep(), ] block_names = ["text_inputs", "additional_inputs"] @@ -154,12 +290,71 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): ) +# auto_docstring class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): + """ + Input step that prepares the inputs for the edit inpaint denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs. + - update height/width based `image_latents`, patchify `image_latents`. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
+ processed_mask_image (`Tensor`, *optional*): + The processed mask image + + Outputs: + batch_size (`int`): + The batch size of the prompt embeddings + dtype (`dtype`): + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) + image_height (`int`): + The image height calculated from the image latents dimension + image_width (`int`): + The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and + batch-expanded) + processed_mask_image (`Tensor`): + The processed mask image (batch-expanded) + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), QwenImageAdditionalInputsStep( - image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"] + additional_batch_inputs=[ + InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image") + ] ), ] block_names = ["text_inputs", "additional_inputs"] @@ -174,7 +369,42 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): # assemble prepare latents steps +# auto_docstring class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): + """ + This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: + - Add noise to the image latents to create the latents input for the denoiser. + - Create the patchified latents `mask` based on the processed mask image. + + Components: + scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`) + + Inputs: + latents (`Tensor`): + The initial random noised, can be generated in prepare latent step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and updated in input step.) + timesteps (`Tensor`): + The timesteps to use for the denoising process. Can be generated in set_timesteps step. + processed_mask_image (`Tensor`): + The processed mask to use for the inpainting process. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + dtype (`dtype`, *optional*, defaults to torch.float32): + The dtype of the model inputs, can be generated in input step. + + Outputs: + initial_noise (`Tensor`): + The initial random noised used for inpainting denoising. + latents (`Tensor`): + The scaled noisy latents to use for inpainting/image-to-image denoising. + mask (`Tensor`): + The mask to use for the inpainting process. + """ + model_name = "qwenimage-edit" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -189,7 +419,50 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): # Qwen Image Edit (image2image) core denoise step +# auto_docstring class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoising workflow for QwenImage-Edit edit (img2img) task. 
+ + Components: + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditInputStep(), @@ -212,9 +485,62 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Core denoising workflow for QwenImage-Edit edit (img2img) task." + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + # Qwen Image Edit (inpainting) core denoise step +# auto_docstring class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoising workflow for QwenImage-Edit edit inpaint task. + + Components: + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. 
+ generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditInpaintInputStep(), @@ -239,6 +565,12 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): def description(self): return "Core denoising workflow for QwenImage-Edit edit inpaint task." + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + # Auto core denoise step for QwenImage Edit class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks): @@ -267,6 +599,12 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks): "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit." ) + @property + def outputs(self): + return [ + OutputParam.template("latents"), + ] + # ==================== # 4. DECODE @@ -274,7 +612,26 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks): # Decode step (standard) +# auto_docstring class QwenImageEditDecodeStep(SequentialPipelineBlocks): + """ + Decode step that decodes the latents to images and postprocess the generated image. + + Components: + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`List`): + Generated images. (tensor output of the vae decoder.) + """ + model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -285,7 +642,30 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): # Inpaint decode step +# auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): + """ + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. + + Components: + vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. + + Outputs: + images (`List`): + Generated images. (tensor output of the vae decoder.) 
+ """ + model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -313,9 +693,7 @@ class QwenImageEditAutoDecodeStep(AutoPipelineBlocks): @property def outputs(self): return [ - OutputParam( - name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.template("latents"), ] @@ -333,7 +711,66 @@ EDIT_AUTO_BLOCKS = InsertableDict( ) +# auto_docstring class QwenImageEditAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. + - for edit (img2img) generation, you need to provide `image` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide + `padding_mask_crop` + + Components: + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae + (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + mask_image (`Image`, *optional*): + Mask image for inpainting. + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`): + The height in pixels of the generated image. + width (`int`): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + processed_mask_image (`Tensor`, *optional*): + The processed mask image + latents (`Tensor`): + Pre-generated noisy latents for image generation. + num_inference_steps (`int`): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + strength (`float`, *optional*, defaults to 0.9): + Strength for img2img/inpainting. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + mask_overlay_kwargs (`Dict`, *optional*): + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. + + Outputs: + images (`List`): + Generated images. 
+ """ + model_name = "qwenimage-edit" block_classes = EDIT_AUTO_BLOCKS.values() block_names = EDIT_AUTO_BLOCKS.keys() @@ -349,5 +786,5 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"), + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 99c5b109bf..37656cef5d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -12,11 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List - -import PIL.Image -import torch - from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam @@ -53,12 +48,41 @@ logger = logging.get_logger(__name__) # ==================== +# auto_docstring class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): - """VL encoder that takes both image and text prompts. Uses 384x384 target area.""" + """ + QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. + + Components: + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + Outputs: + resized_image (`List`): + Images resized to 1024x1024 target area for VAE encoding + resized_cond_image (`List`): + Images resized to 384x384 target area for VL text encoding + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. + """ model_name = "qwenimage-edit-plus" block_classes = [ - QwenImageEditPlusResizeStep(target_area=384 * 384, output_name="resized_cond_image"), + QwenImageEditPlusResizeStep(), QwenImageEditPlusTextEncoderStep(), ] block_names = ["resize", "encode"] @@ -73,12 +97,36 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): # ==================== +# auto_docstring class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): - """VAE encoder that handles multiple images with different sizes. Uses 1024x1024 target area.""" + """ + VAE encoder step that encodes image inputs into latent representations. + Each image is resized independently based on its own aspect ratio to 1024x1024 target area. + + Components: + image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. 
+ + Outputs: + resized_image (`List`): + Images resized to 1024x1024 target area for VAE encoding + resized_cond_image (`List`): + Images resized to 384x384 target area for VL text encoding + processed_image (`Tensor`): + The processed image + image_latents (`Tensor`): + The latent representation of the input image. + """ model_name = "qwenimage-edit-plus" block_classes = [ - QwenImageEditPlusResizeStep(target_area=1024 * 1024, output_name="resized_image"), + QwenImageEditPlusResizeStep(), QwenImageEditPlusProcessImagesInputStep(), QwenImageVaeEncoderStep(), ] @@ -98,11 +146,66 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): # assemble input steps +# auto_docstring class QwenImageEditPlusInputStep(SequentialPipelineBlocks): + """ + Input step that prepares the inputs for the Edit Plus denoising step. It: + - Standardizes text embeddings batch size. + - Processes list of image latents: patchifies, concatenates along dim=1, expands batch. + - Outputs lists of image_height/image_width for RoPE calculation. + - Defaults height/width from last image in the list. + + Components: + pachifier (`QwenImagePachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + batch_size (`int`): + The batch size of the prompt embeddings + dtype (`dtype`): + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. (batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) + image_height (`List`): + The image heights calculated from the image latents dimension + image_width (`List`): + The image widths calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified, + concatenated, and batch-expanded) + """ + model_name = "qwenimage-edit-plus" block_classes = [ QwenImageTextInputsStep(), - QwenImageEditPlusAdditionalInputsStep(image_latent_inputs=["image_latents"]), + QwenImageEditPlusAdditionalInputsStep(), ] block_names = ["text_inputs", "additional_inputs"] @@ -118,7 +221,50 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): # Qwen Image Edit Plus (image2image) core denoise step +# auto_docstring class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. 
+ + Components: + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage-edit-plus" block_classes = [ QwenImageEditPlusInputStep(), @@ -144,9 +290,7 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): @property def outputs(self): return [ - OutputParam( - name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.template("latents"), ] @@ -155,7 +299,26 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): # ==================== +# auto_docstring class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): + """ + Decode step that decodes the latents to images and postprocesses the generated image. + + Components: + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) + + Inputs: + latents (`Tensor`): + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`List`): + Generated images. (tensor output of the vae decoder.) + """ + model_name = "qwenimage-edit-plus" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -179,7 +342,53 @@ EDIT_PLUS_AUTO_BLOCKS = InsertableDict( ) +# auto_docstring class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. + - `image` is required input (can be single image or list of images). + - Each image is resized independently based on its own aspect ratio. + - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. 
+ + Components: + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) + transformer (`QwenImageTransformer2DModel`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`, *optional*): + The height in pixels of the generated image. + width (`int`, *optional*): + The width in pixels of the generated image. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`List`): + Generated images. + """ + model_name = "qwenimage-edit-plus" block_classes = EDIT_PLUS_AUTO_BLOCKS.values() block_names = EDIT_PLUS_AUTO_BLOCKS.keys() @@ -196,5 +405,5 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"), + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 63ee36df51..fdfeab0488 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -12,12 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - -from typing import List - -import PIL.Image -import torch - from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks from ..modular_pipeline_utils import InsertableDict, OutputParam @@ -55,8 +49,44 @@ logger = logging.get_logger(__name__) # ==================== +# auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): - """Text encoder that takes text prompt, will generate a prompt based on image if not provided.""" + """ + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. + + Components: + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. 
+ resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + use_en_prompt (`bool`, *optional*, defaults to False): + Whether to use English prompt template + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + Outputs: + resized_image (`List`): + The resized images + prompt (`str`): + The prompt or prompts to guide image generation. If not provided, updated using image caption + prompt_embeds (`Tensor`): + The prompt embeddings. + prompt_embeds_mask (`Tensor`): + The encoder attention mask. + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. + """ model_name = "qwenimage-layered" block_classes = [ @@ -77,7 +107,32 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): # Edit VAE encoder +# auto_docstring class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): + """ + Vae encoder step that encode the image inputs into their latent representations. + + Components: + image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae + (`AutoencoderKLQwenImage`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + + Outputs: + resized_image (`List`): + The resized images + processed_image (`Tensor`): + The processed image + image_latents (`Tensor`): + The latent representation of the input image. + """ + model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredResizeStep(), @@ -98,11 +153,60 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): # assemble input steps +# auto_docstring class QwenImageLayeredInputStep(SequentialPipelineBlocks): + """ + Input step that prepares the inputs for the layered denoising step. It: + - make sure the text embeddings have consistent batch size as well as the additional inputs. + - update height/width based `image_latents`, patchify `image_latents`. + + Components: + pachifier (`QwenImageLayeredPachifier`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + + Outputs: + batch_size (`int`): + The batch size of the prompt embeddings + dtype (`dtype`): + The data type of the prompt embeddings + prompt_embeds (`Tensor`): + The prompt embeddings. (batch-expanded) + prompt_embeds_mask (`Tensor`): + The encoder attention mask. 
(batch-expanded) + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings. (batch-expanded) + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask. (batch-expanded) + image_height (`int`): + The image height calculated from the image latents dimension + image_width (`int`): + The image width calculated from the image latents dimension + height (`int`): + if not provided, updated to image height + width (`int`): + if not provided, updated to image width + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified + with layered pachifier and batch-expanded) + """ + model_name = "qwenimage-layered" block_classes = [ QwenImageTextInputsStep(), - QwenImageLayeredAdditionalInputsStep(image_latent_inputs=["image_latents"]), + QwenImageLayeredAdditionalInputsStep(), ] block_names = ["text_inputs", "additional_inputs"] @@ -116,7 +220,48 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): # Qwen Image Layered (image2image) core denoise step +# auto_docstring class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): + """ + Core denoising workflow for QwenImage-Layered img2img task. + + Components: + pachifier (`QwenImageLayeredPachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) + + Inputs: + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + prompt_embeds_mask (`Tensor`): + mask for the text embeddings. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds_mask (`Tensor`, *optional*): + mask for the negative text embeddings. Can be generated from text_encoder step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + + Outputs: + latents (`Tensor`): + Denoised latents. + """ + model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredInputStep(), @@ -142,9 +287,7 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): @property def outputs(self): return [ - OutputParam( - name="latents", type_hint=torch.Tensor, description="The latents generated by the denoising step" - ), + OutputParam.template("latents"), ] @@ -162,7 +305,54 @@ LAYERED_AUTO_BLOCKS = InsertableDict( ) +# auto_docstring class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. 
+ + Components: + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`) + image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) pachifier (`QwenImageLayeredPachifier`) + scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) + + Inputs: + image (`Union[Image, List]`): + Reference image(s) for denoising. Can be a single image or list of images. + resolution (`int`, *optional*, defaults to 640): + The target area to resize the image to, can be 1024 or 640 + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + use_en_prompt (`bool`, *optional*, defaults to False): + Whether to use English prompt template + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + layers (`int`, *optional*, defaults to 4): + Number of layers to extract from the image + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`List`, *optional*): + Custom sigmas for the denoising process. + attention_kwargs (`Dict`, *optional*): + Additional kwargs for attention processors. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + output_type (`str`, *optional*, defaults to pil): + Output format: 'pil', 'np', 'pt'. + + Outputs: + images (`List`): + Generated images. + """ + model_name = "qwenimage-layered" block_classes = LAYERED_AUTO_BLOCKS.values() block_names = LAYERED_AUTO_BLOCKS.keys() @@ -174,5 +364,5 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): @property def outputs(self): return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]], description="The generated images"), + OutputParam.template("images"), ] diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py index 3d5a00a9df..5f76a8459f 100644 --- a/src/diffusers/modular_pipelines/z_image/denoise.py +++ b/src/diffusers/modular_pipelines/z_image/denoise.py @@ -131,7 +131,7 @@ class ZImageLoopDenoiser(ModularPipelineBlocks): ), InputParam( kwargs_type="denoiser_input_fields", - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", + description="The conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", ), ] guider_input_names = [] diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py new file mode 100644 index 0000000000..fc4a82f98e --- /dev/null +++ b/utils/modular_auto_docstring.py @@ -0,0 +1,352 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Auto Docstring Generator for Modular Pipeline Blocks + +This script scans Python files for classes that have `# auto_docstring` comment above them +and inserts/updates the docstring from the class's `doc` property. + +Run from the root of the repo: + python utils/modular_auto_docstring.py [path] [--fix_and_overwrite] + +Examples: + # Check for auto_docstring markers (will error if found without proper docstring) + python utils/modular_auto_docstring.py + + # Check specific directory + python utils/modular_auto_docstring.py src/diffusers/modular_pipelines/ + + # Fix and overwrite the docstrings + python utils/modular_auto_docstring.py --fix_and_overwrite + +Usage in code: + # auto_docstring + class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): + # docstring will be automatically inserted here + + @property + def doc(self): + return "Your docstring content..." +""" + +import argparse +import ast +import glob +import importlib +import os +import re +import subprocess +import sys + + +# All paths are set with the intent you should run this script from the root of the repo +DIFFUSERS_PATH = "src/diffusers" +REPO_PATH = "." + +# Pattern to match the auto_docstring comment +AUTO_DOCSTRING_PATTERN = re.compile(r"^\s*#\s*auto_docstring\s*$") + + +def setup_diffusers_import(): + """Setup import path to use the local diffusers module.""" + src_path = os.path.join(REPO_PATH, "src") + if src_path not in sys.path: + sys.path.insert(0, src_path) + + +def get_module_from_filepath(filepath: str) -> str: + """Convert a filepath to a module name.""" + filepath = os.path.normpath(filepath) + + if filepath.startswith("src" + os.sep): + filepath = filepath[4:] + + if filepath.endswith(".py"): + filepath = filepath[:-3] + + module_name = filepath.replace(os.sep, ".") + return module_name + + +def load_module(filepath: str): + """Load a module from filepath.""" + setup_diffusers_import() + module_name = get_module_from_filepath(filepath) + + try: + module = importlib.import_module(module_name) + return module + except Exception as e: + print(f"Warning: Could not import module {module_name}: {e}") + return None + + +def get_doc_from_class(module, class_name: str) -> str: + """Get the doc property from an instantiated class.""" + if module is None: + return None + + cls = getattr(module, class_name, None) + if cls is None: + return None + + try: + instance = cls() + if hasattr(instance, "doc"): + return instance.doc + except Exception as e: + print(f"Warning: Could not instantiate {class_name}: {e}") + + return None + + +def find_auto_docstring_classes(filepath: str) -> list: + """ + Find all classes in a file that have # auto_docstring comment above them. 
+ + Returns list of (class_name, class_line_number, has_existing_docstring, docstring_end_line) + """ + with open(filepath, "r", encoding="utf-8", newline="\n") as f: + lines = f.readlines() + + # Parse AST to find class locations and their docstrings + content = "".join(lines) + try: + tree = ast.parse(content) + except SyntaxError as e: + print(f"Syntax error in {filepath}: {e}") + return [] + + # Build a map of class_name -> (class_line, has_docstring, docstring_end_line) + class_info = {} + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + has_docstring = False + docstring_end_line = node.lineno # default to class line + + if node.body and isinstance(node.body[0], ast.Expr): + first_stmt = node.body[0] + if isinstance(first_stmt.value, ast.Constant) and isinstance(first_stmt.value.value, str): + has_docstring = True + docstring_end_line = first_stmt.end_lineno or first_stmt.lineno + + class_info[node.name] = (node.lineno, has_docstring, docstring_end_line) + + # Now scan for # auto_docstring comments + classes_to_update = [] + + for i, line in enumerate(lines): + if AUTO_DOCSTRING_PATTERN.match(line): + # Found the marker, look for class definition on next non-empty, non-comment line + j = i + 1 + while j < len(lines): + next_line = lines[j].strip() + if next_line and not next_line.startswith("#"): + break + j += 1 + + if j < len(lines) and lines[j].strip().startswith("class "): + # Extract class name + match = re.match(r"class\s+(\w+)", lines[j].strip()) + if match: + class_name = match.group(1) + if class_name in class_info: + class_line, has_docstring, docstring_end_line = class_info[class_name] + classes_to_update.append((class_name, class_line, has_docstring, docstring_end_line)) + + return classes_to_update + + +def strip_class_name_line(doc: str, class_name: str) -> str: + """Remove the 'class ClassName' line from the doc if present.""" + lines = doc.strip().split("\n") + if lines and lines[0].strip() == f"class {class_name}": + # Remove the class line and any blank line following it + lines = lines[1:] + while lines and not lines[0].strip(): + lines = lines[1:] + return "\n".join(lines) + + +def format_docstring(doc: str, indent: str = " ") -> str: + """Format a doc string as a properly indented docstring.""" + lines = doc.strip().split("\n") + + if len(lines) == 1: + return f'{indent}"""{lines[0]}"""\n' + else: + result = [f'{indent}"""\n'] + for line in lines: + if line.strip(): + result.append(f"{indent}{line}\n") + else: + result.append("\n") + result.append(f'{indent}"""\n') + return "".join(result) + + +def run_ruff_format(filepath: str): + """Run ruff check --fix, ruff format, and doc-builder style on a file to ensure consistent formatting.""" + try: + # First run ruff check --fix to fix any linting issues (including line length) + subprocess.run( + ["ruff", "check", "--fix", filepath], + check=False, # Don't fail if there are unfixable issues + capture_output=True, + text=True, + ) + # Then run ruff format for code formatting + subprocess.run( + ["ruff", "format", filepath], + check=True, + capture_output=True, + text=True, + ) + # Finally run doc-builder style for docstring formatting + subprocess.run( + ["doc-builder", "style", filepath, "--max_len", "119"], + check=False, # Don't fail if doc-builder has issues + capture_output=True, + text=True, + ) + print(f"Formatted {filepath}") + except subprocess.CalledProcessError as e: + print(f"Warning: formatting failed for {filepath}: {e.stderr}") + except FileNotFoundError as e: + print(f"Warning: tool not 
found ({e}). Skipping formatting.") + except Exception as e: + print(f"Warning: unexpected error formatting {filepath}: {e}") + + +def get_existing_docstring(lines: list, class_line: int, docstring_end_line: int) -> str: + """Extract the existing docstring content from lines.""" + # class_line is 1-indexed, docstring starts at class_line (0-indexed: class_line) + # and ends at docstring_end_line (1-indexed, inclusive) + docstring_lines = lines[class_line:docstring_end_line] + return "".join(docstring_lines) + + +def process_file(filepath: str, overwrite: bool = False) -> list: + """ + Process a file and find/insert docstrings for # auto_docstring marked classes. + + Returns list of classes that need updating. + """ + classes_to_update = find_auto_docstring_classes(filepath) + + if not classes_to_update: + return [] + + if not overwrite: + # Check mode: only verify that docstrings exist + # Content comparison is not reliable due to formatting differences + classes_needing_update = [] + for class_name, class_line, has_docstring, docstring_end_line in classes_to_update: + if not has_docstring: + # No docstring exists, needs update + classes_needing_update.append((filepath, class_name, class_line)) + return classes_needing_update + + # Load the module to get doc properties + module = load_module(filepath) + + with open(filepath, "r", encoding="utf-8", newline="\n") as f: + lines = f.readlines() + + # Process in reverse order to maintain line numbers + updated = False + for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update): + doc = get_doc_from_class(module, class_name) + + if doc is None: + print(f"Warning: Could not get doc for {class_name} in {filepath}") + continue + + # Remove the "class ClassName" line since it's redundant in a docstring + doc = strip_class_name_line(doc, class_name) + + # Format the new docstring with 4-space indent + new_docstring = format_docstring(doc, " ") + + if has_docstring: + # Replace existing docstring (line after class definition to docstring_end_line) + # class_line is 1-indexed, we want to replace from class_line+1 to docstring_end_line + lines = lines[:class_line] + [new_docstring] + lines[docstring_end_line:] + else: + # Insert new docstring right after class definition line + # class_line is 1-indexed, so lines[class_line-1] is the class line + # Insert at position class_line (which is right after the class line) + lines = lines[:class_line] + [new_docstring] + lines[class_line:] + + updated = True + print(f"Updated docstring for {class_name} in {filepath}") + + if updated: + with open(filepath, "w", encoding="utf-8", newline="\n") as f: + f.writelines(lines) + # Run ruff format to ensure consistent line wrapping + run_ruff_format(filepath) + + return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] + + +def check_auto_docstrings(path: str = None, overwrite: bool = False): + """ + Check all files for # auto_docstring markers and optionally fix them. 
+ """ + if path is None: + path = DIFFUSERS_PATH + + if os.path.isfile(path): + all_files = [path] + else: + all_files = glob.glob(os.path.join(path, "**/*.py"), recursive=True) + + all_markers = [] + + for filepath in all_files: + markers = process_file(filepath, overwrite) + all_markers.extend(markers) + + if not overwrite and len(all_markers) > 0: + message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers]) + raise ValueError( + f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n" + f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them." + ) + + if overwrite and len(all_markers) > 0: + print(f"\nProcessed {len(all_markers)} docstring(s).") + elif not overwrite and len(all_markers) == 0: + print("All # auto_docstring markers have valid docstrings.") + elif len(all_markers) == 0: + print("No # auto_docstring markers found.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Check and fix # auto_docstring markers in modular pipeline blocks", + ) + parser.add_argument("path", nargs="?", default=None, help="File or directory to process (default: src/diffusers)") + parser.add_argument( + "--fix_and_overwrite", + action="store_true", + help="Whether to fix the docstrings by inserting them from doc property.", + ) + + args = parser.parse_args() + + check_auto_docstrings(args.path, args.fix_and_overwrite) From d9959bd53ba954d3e9680f2666726589dcc2c260 Mon Sep 17 00:00:00 2001 From: Salman Chishti Date: Tue, 27 Jan 2026 06:22:50 +0000 Subject: [PATCH 02/10] Upgrade GitHub Actions to latest versions (#12866) * Upgrade GitHub Actions to latest versions Signed-off-by: Salman Muin Kayser Chishti <13schishti@gmail.com> * fix: Correct GitHub Actions upgrade (fix branch refs and version formats) * fix: Correct GitHub Actions upgrade (fix branch refs and version formats) * fix: Correct GitHub Actions upgrade (fix branch refs and version formats) --------- Signed-off-by: Salman Muin Kayser Chishti <13schishti@gmail.com> --- .github/workflows/build_docker_images.yml | 8 ++++---- .github/workflows/typos.yml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml index 1074a5bba6..f928e123aa 100644 --- a/.github/workflows/build_docker_images.yml +++ b/.github/workflows/build_docker_images.yml @@ -25,7 +25,7 @@ jobs: if: github.event_name == 'pull_request' steps: - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 + uses: docker/setup-buildx-action@v3 - name: Check out code uses: actions/checkout@v6 @@ -101,14 +101,14 @@ jobs: - name: Checkout repository uses: actions/checkout@v6 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 + uses: docker/setup-buildx-action@v3 - name: Login to Docker Hub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ env.REGISTRY }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Build and push - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v6 with: no-cache: true context: ./docker/${{ matrix.image-name }} diff --git a/.github/workflows/typos.yml b/.github/workflows/typos.yml index 64bac65353..87ea38a5bb 100644 --- a/.github/workflows/typos.yml +++ b/.github/workflows/typos.yml @@ -11,4 +11,4 @@ jobs: - uses: actions/checkout@v6 - name: typos-action - uses: crate-ci/typos@v1.12.4 + uses: crate-ci/typos@v1.42.1 From 53279ef017e1d22a2c675332d1b6cd835c684ec7 Mon Sep 17 00:00:00 2001 From: 
Sam Edwards Date: Tue, 27 Jan 2026 17:27:21 +1100 Subject: [PATCH 03/10] [From Single File] support `from_single_file` method for `WanAnimateTransformer3DModel` (#12691) * Add `WanAnimateTransformer3DModel` to `SINGLE_FILE_LOADABLE_CLASSES` * Fixed dtype mismatch when loading a single file * Fixed a bug that results in white noise for generation * Update dtype check for time embedder - caused white noise output * Improve code readability * Optimize dtype handling Removed unnecessary dtype conversions for timestep and weight. * Apply style fixes * Refactor time embedding dtype handling Adjust time embedding type conversion for compatibility. * Apply style fixes * Modify comment for WanTimeTextImageEmbedding class --------- Co-authored-by: Sam Edwards Co-authored-by: github-actions[bot] --- src/diffusers/loaders/single_file_model.py | 4 + src/diffusers/loaders/single_file_utils.py | 83 +++++++++++++++++-- .../transformers/transformer_wan_animate.py | 17 ++-- 3 files changed, 91 insertions(+), 13 deletions(-) diff --git a/src/diffusers/loaders/single_file_model.py b/src/diffusers/loaders/single_file_model.py index c733bd489d..ab4340fed1 100644 --- a/src/diffusers/loaders/single_file_model.py +++ b/src/diffusers/loaders/single_file_model.py @@ -152,6 +152,10 @@ SINGLE_FILE_LOADABLE_CLASSES = { "checkpoint_mapping_fn": convert_wan_transformer_to_diffusers, "default_subfolder": "transformer", }, + "WanAnimateTransformer3DModel": { + "checkpoint_mapping_fn": convert_wan_transformer_to_diffusers, + "default_subfolder": "transformer", + }, "AutoencoderKLWan": { "checkpoint_mapping_fn": convert_wan_vae_to_diffusers, "default_subfolder": "vae", diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py index 26f6c4388d..5e11acb51c 100644 --- a/src/diffusers/loaders/single_file_utils.py +++ b/src/diffusers/loaders/single_file_utils.py @@ -136,6 +136,7 @@ CHECKPOINT_KEY_NAMES = { "wan": ["model.diffusion_model.head.modulation", "head.modulation"], "wan_vae": "decoder.middle.0.residual.0.gamma", "wan_vace": "vace_blocks.0.after_proj.bias", + "wan_animate": "motion_encoder.dec.direction.weight", "hidream": "double_stream_blocks.0.block.adaLN_modulation.1.bias", "cosmos-1.0": [ "net.x_embedder.proj.1.weight", @@ -219,6 +220,7 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = { "wan-t2v-1.3B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"}, "wan-t2v-14B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-T2V-14B-Diffusers"}, "wan-i2v-14B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"}, + "wan-animate-14B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.2-Animate-14B-Diffusers"}, "wan-vace-1.3B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-VACE-1.3B-diffusers"}, "wan-vace-14B": {"pretrained_model_name_or_path": "Wan-AI/Wan2.1-VACE-14B-diffusers"}, "hidream": {"pretrained_model_name_or_path": "HiDream-ai/HiDream-I1-Dev"}, @@ -759,6 +761,9 @@ def infer_diffusers_model_type(checkpoint): elif checkpoint[target_key].shape[0] == 5120: model_type = "wan-vace-14B" + if CHECKPOINT_KEY_NAMES["wan_animate"] in checkpoint: + model_type = "wan-animate-14B" + elif checkpoint[target_key].shape[0] == 1536: model_type = "wan-t2v-1.3B" elif checkpoint[target_key].shape[0] == 5120 and checkpoint[target_key].shape[1] == 16: @@ -3154,13 +3159,64 @@ def convert_sana_transformer_to_diffusers(checkpoint, **kwargs): def convert_wan_transformer_to_diffusers(checkpoint, **kwargs): + def generate_motion_encoder_mappings(): + mappings = { + 
"motion_encoder.dec.direction.weight": "motion_encoder.motion_synthesis_weight", + "motion_encoder.enc.net_app.convs.0.0.weight": "motion_encoder.conv_in.weight", + "motion_encoder.enc.net_app.convs.0.1.bias": "motion_encoder.conv_in.act_fn.bias", + "motion_encoder.enc.net_app.convs.8.weight": "motion_encoder.conv_out.weight", + "motion_encoder.enc.fc": "motion_encoder.motion_network", + } + + for i in range(7): + conv_idx = i + 1 + mappings.update( + { + f"motion_encoder.enc.net_app.convs.{conv_idx}.conv1.0.weight": f"motion_encoder.res_blocks.{i}.conv1.weight", + f"motion_encoder.enc.net_app.convs.{conv_idx}.conv1.1.bias": f"motion_encoder.res_blocks.{i}.conv1.act_fn.bias", + f"motion_encoder.enc.net_app.convs.{conv_idx}.conv2.1.weight": f"motion_encoder.res_blocks.{i}.conv2.weight", + f"motion_encoder.enc.net_app.convs.{conv_idx}.conv2.2.bias": f"motion_encoder.res_blocks.{i}.conv2.act_fn.bias", + f"motion_encoder.enc.net_app.convs.{conv_idx}.skip.1.weight": f"motion_encoder.res_blocks.{i}.conv_skip.weight", + } + ) + + return mappings + + def generate_face_adapter_mappings(): + return { + "face_adapter.fuser_blocks": "face_adapter", + ".k_norm.": ".norm_k.", + ".q_norm.": ".norm_q.", + ".linear1_q.": ".to_q.", + ".linear2.": ".to_out.", + "conv1_local.conv": "conv1_local", + "conv2.conv": "conv2", + "conv3.conv": "conv3", + } + + def split_tensor_handler(key, state_dict, split_pattern, target_keys): + tensor = state_dict.pop(key) + split_idx = tensor.shape[0] // 2 + + new_key_1 = key.replace(split_pattern, target_keys[0]) + new_key_2 = key.replace(split_pattern, target_keys[1]) + + state_dict[new_key_1] = tensor[:split_idx] + state_dict[new_key_2] = tensor[split_idx:] + + def reshape_bias_handler(key, state_dict): + if "motion_encoder.enc.net_app.convs." in key and ".bias" in key: + state_dict[key] = state_dict[key][0, :, 0, 0] + converted_state_dict = {} + # Strip model.diffusion_model prefix keys = list(checkpoint.keys()) for k in keys: if "model.diffusion_model." 
in k: checkpoint[k.replace("model.diffusion_model.", "")] = checkpoint.pop(k) + # Base transformer mappings TRANSFORMER_KEYS_RENAME_DICT = { "time_embedding.0": "condition_embedder.time_embedder.linear_1", "time_embedding.2": "condition_embedder.time_embedder.linear_2", @@ -3182,28 +3238,43 @@ def convert_wan_transformer_to_diffusers(checkpoint, **kwargs): "ffn.0": "ffn.net.0.proj", "ffn.2": "ffn.net.2", # Hack to swap the layer names - # The original model calls the norms in following order: norm1, norm3, norm2 - # We convert it to: norm1, norm2, norm3 "norm2": "norm__placeholder", "norm3": "norm2", "norm__placeholder": "norm3", - # For the I2V model + # I2V model "img_emb.proj.0": "condition_embedder.image_embedder.norm1", "img_emb.proj.1": "condition_embedder.image_embedder.ff.net.0.proj", "img_emb.proj.3": "condition_embedder.image_embedder.ff.net.2", "img_emb.proj.4": "condition_embedder.image_embedder.norm2", - # For the VACE model + # VACE model "before_proj": "proj_in", "after_proj": "proj_out", } + SPECIAL_KEYS_HANDLERS = {} + if any("face_adapter" in k for k in checkpoint.keys()): + TRANSFORMER_KEYS_RENAME_DICT.update(generate_face_adapter_mappings()) + SPECIAL_KEYS_HANDLERS[".linear1_kv."] = (split_tensor_handler, [".to_k.", ".to_v."]) + + if any("motion_encoder" in k for k in checkpoint.keys()): + TRANSFORMER_KEYS_RENAME_DICT.update(generate_motion_encoder_mappings()) + for key in list(checkpoint.keys()): - new_key = key[:] + reshape_bias_handler(key, checkpoint) + + for key in list(checkpoint.keys()): + new_key = key for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items(): new_key = new_key.replace(replace_key, rename_key) - converted_state_dict[new_key] = checkpoint.pop(key) + for key in list(converted_state_dict.keys()): + for pattern, (handler_fn, target_keys) in SPECIAL_KEYS_HANDLERS.items(): + if pattern not in key: + continue + handler_fn(key, converted_state_dict, pattern, target_keys) + break + return converted_state_dict diff --git a/src/diffusers/models/transformers/transformer_wan_animate.py b/src/diffusers/models/transformers/transformer_wan_animate.py index 8860f4bca9..538a029cd8 100644 --- a/src/diffusers/models/transformers/transformer_wan_animate.py +++ b/src/diffusers/models/transformers/transformer_wan_animate.py @@ -166,9 +166,11 @@ class MotionConv2d(nn.Module): # NOTE: the original implementation uses a 2D upfirdn operation with the upsampling and downsampling rates # set to 1, which should be equivalent to a 2D convolution expanded_kernel = self.blur_kernel[None, None, :, :].expand(self.in_channels, 1, -1, -1) + x = x.to(expanded_kernel.dtype) x = F.conv2d(x, expanded_kernel, padding=self.blur_padding, groups=self.in_channels) # Main Conv2D with scaling + x = x.to(self.weight.dtype) x = F.conv2d(x, self.weight * self.scale, bias=self.bias, stride=self.stride, padding=self.padding) # Activation with fused bias, if using @@ -338,8 +340,7 @@ class WanAnimateMotionEncoder(nn.Module): weight = self.motion_synthesis_weight + 1e-8 # Upcast the QR orthogonalization operation to FP32 original_motion_dtype = motion_feat.dtype - motion_feat = motion_feat.to(torch.float32) - weight = weight.to(torch.float32) + motion_feat = motion_feat.to(weight.dtype) Q = torch.linalg.qr(weight)[0].to(device=motion_feat.device) @@ -769,7 +770,7 @@ class WanImageEmbedding(torch.nn.Module): return hidden_states -# Copied from diffusers.models.transformers.transformer_wan.WanTimeTextImageEmbedding +# Modified from 
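# A minimal sketch of the loading path this patch enables; the checkpoint filename is a
# placeholder and dtype/device handling is simplified.
import torch

from diffusers import WanAnimateTransformer3DModel

transformer = WanAnimateTransformer3DModel.from_single_file(
    "path/to/wan2.2-animate-14b.safetensors",  # placeholder: an original-format Animate checkpoint
    torch_dtype=torch.bfloat16,
)
# The loader detects the Animate variant via the "motion_encoder.dec.direction.weight" key,
# resolves its config from Wan-AI/Wan2.2-Animate-14B-Diffusers (transformer subfolder), and
# runs convert_wan_transformer_to_diffusers() with the face-adapter and motion-encoder renames.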
diffusers.models.transformers.transformer_wan.WanTimeTextImageEmbedding class WanTimeTextImageEmbedding(nn.Module): def __init__( self, @@ -803,10 +804,12 @@ class WanTimeTextImageEmbedding(nn.Module): if timestep_seq_len is not None: timestep = timestep.unflatten(0, (-1, timestep_seq_len)) - time_embedder_dtype = next(iter(self.time_embedder.parameters())).dtype - if timestep.dtype != time_embedder_dtype and time_embedder_dtype != torch.int8: - timestep = timestep.to(time_embedder_dtype) - temb = self.time_embedder(timestep).type_as(encoder_hidden_states) + if self.time_embedder.linear_1.weight.dtype.is_floating_point: + time_embedder_dtype = self.time_embedder.linear_1.weight.dtype + else: + time_embedder_dtype = encoder_hidden_states.dtype + + temb = self.time_embedder(timestep.to(time_embedder_dtype)).type_as(encoder_hidden_states) timestep_proj = self.time_proj(self.act_fn(temb)) encoder_hidden_states = self.text_embedder(encoder_hidden_states) From 71a865b742e73c23ce9ac0414bcd95974561d43c Mon Sep 17 00:00:00 2001 From: Aditya Borate <23110065@iitgn.ac.in> Date: Wed, 28 Jan 2026 03:50:44 +0530 Subject: [PATCH 04/10] Fix: Cosmos2.5 Video2World frame extraction and add default negative prompt (#13018) * fix: Extract last frames for conditioning in Cosmos Video2World * Added default negative prompt * Apply style fixes * Added default negative prompt in cosmos2 text2image pipeline --------- Co-authored-by: YiYi Xu Co-authored-by: github-actions[bot] --- .../cosmos/pipeline_cosmos2_5_predict.py | 45 ++++++++++++++++--- .../cosmos/pipeline_cosmos2_text2image.py | 10 ++++- .../cosmos/pipeline_cosmos2_video2world.py | 10 ++++- .../cosmos/pipeline_cosmos_text2world.py | 10 ++++- .../cosmos/pipeline_cosmos_video2world.py | 10 ++++- 5 files changed, 75 insertions(+), 10 deletions(-) diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py index ea9df999dd..0f3f62551d 100644 --- a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py +++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py @@ -52,6 +52,15 @@ else: logger = logging.get_logger(__name__) # pylint: disable=invalid-name +DEFAULT_NEGATIVE_PROMPT = ( + "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, " + "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, " + "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, " + "jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, " + "fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. " + "Overall, the video is of poor quality." 
+) + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents def retrieve_latents( @@ -359,7 +368,7 @@ class Cosmos2_5_PredictBasePipeline(DiffusionPipeline): prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) if do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" + negative_prompt = negative_prompt if negative_prompt is not None else DEFAULT_NEGATIVE_PROMPT negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt if prompt is not None and type(prompt) is not type(negative_prompt): @@ -549,6 +558,7 @@ class Cosmos2_5_PredictBasePipeline(DiffusionPipeline): callback_on_step_end_tensor_inputs: List[str] = ["latents"], max_sequence_length: int = 512, conditional_frame_timestep: float = 0.1, + num_latent_conditional_frames: int = 2, ): r""" The call function to the pipeline for generation. Supports three modes: @@ -614,6 +624,10 @@ class Cosmos2_5_PredictBasePipeline(DiffusionPipeline): max_sequence_length (`int`, defaults to `512`): The maximum number of tokens in the prompt. If the prompt exceeds this length, it will be truncated. If the prompt is shorter than this length, it will be padded. + num_latent_conditional_frames (`int`, defaults to `2`): + Number of latent conditional frames to use for Video2World conditioning. The number of pixel frames + extracted from the input video is calculated as `4 * (num_latent_conditional_frames - 1) + 1`. Set to 1 + for Image2World-like behavior (single frame conditioning). Examples: @@ -692,19 +706,38 @@ class Cosmos2_5_PredictBasePipeline(DiffusionPipeline): video = torch.zeros(batch_size, num_frames, 3, height, width, dtype=torch.uint8) num_frames_in = 0 else: - num_frames_in = len(video) - if batch_size != 1: raise ValueError(f"batch_size must be 1 for video input (given {batch_size})") + if num_latent_conditional_frames not in [1, 2]: + raise ValueError( + f"num_latent_conditional_frames must be 1 or 2, but got {num_latent_conditional_frames}" + ) + + frames_to_extract = 4 * (num_latent_conditional_frames - 1) + 1 + + total_input_frames = len(video) + + if total_input_frames < frames_to_extract: + raise ValueError( + f"Input video has only {total_input_frames} frames but Video2World requires at least " + f"{frames_to_extract} frames for conditioning." 
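# Worked example for the conditioning-frame arithmetic above (numbers are illustrative):
#   num_latent_conditional_frames = 1  ->  frames_to_extract = 4 * 0 + 1 = 1  (image-conditioned)
#   num_latent_conditional_frames = 2  ->  frames_to_extract = 4 * 1 + 1 = 5  (default)
# With a 24-frame input clip and the default setting, only the last 5 frames are kept for
# conditioning; the sequence is then padded back up to `num_frames` by repeating the final frame.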
+ ) + + num_frames_in = frames_to_extract + assert video is not None video = self.video_processor.preprocess_video(video, height, width) - # pad with last frame (for video2world) + # For Video2World: extract last frames_to_extract frames from input, then pad + if image is None and num_frames_in > 0 and num_frames_in < video.shape[2]: + video = video[:, :, -num_frames_in:, :, :] + num_frames_out = num_frames + if video.shape[2] < num_frames_out: - n_pad_frames = num_frames_out - num_frames_in - last_frame = video[0, :, -1:, :, :] # [C, T==1, H, W] + n_pad_frames = num_frames_out - video.shape[2] + last_frame = video[:, :, -1:, :, :] # [B, C, T==1, H, W] pad_frames = last_frame.repeat(1, 1, n_pad_frames, 1, 1) # [B, C, T, H, W] video = torch.cat((video, pad_frames), dim=2) diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py index 66490c2be1..d5ee609622 100644 --- a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py +++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py @@ -49,6 +49,14 @@ else: logger = logging.get_logger(__name__) # pylint: disable=invalid-name +DEFAULT_NEGATIVE_PROMPT = ( + "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, " + "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, " + "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, " + "jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, " + "fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. " + "Overall, the video is of poor quality." +) EXAMPLE_DOC_STRING = """ Examples: @@ -300,7 +308,7 @@ class Cosmos2TextToImagePipeline(DiffusionPipeline): prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) if do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" + negative_prompt = negative_prompt if negative_prompt is not None else DEFAULT_NEGATIVE_PROMPT negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt if prompt is not None and type(prompt) is not type(negative_prompt): diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py index 23a74ad00f..b2c44d0d99 100644 --- a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py +++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_video2world.py @@ -50,6 +50,14 @@ else: logger = logging.get_logger(__name__) # pylint: disable=invalid-name +DEFAULT_NEGATIVE_PROMPT = ( + "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, " + "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, " + "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, " + "jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, " + "fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. " + "Overall, the video is of poor quality." 
+) EXAMPLE_DOC_STRING = """ Examples: @@ -319,7 +327,7 @@ class Cosmos2VideoToWorldPipeline(DiffusionPipeline): prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) if do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" + negative_prompt = negative_prompt if negative_prompt is not None else DEFAULT_NEGATIVE_PROMPT negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt if prompt is not None and type(prompt) is not type(negative_prompt): diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py index f0aa1ecf0e..676b7c7a72 100644 --- a/src/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py +++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos_text2world.py @@ -49,6 +49,14 @@ else: logger = logging.get_logger(__name__) # pylint: disable=invalid-name +DEFAULT_NEGATIVE_PROMPT = ( + "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, " + "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, " + "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, " + "jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, " + "fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. " + "Overall, the video is of poor quality." +) EXAMPLE_DOC_STRING = """ Examples: @@ -285,7 +293,7 @@ class CosmosTextToWorldPipeline(DiffusionPipeline): prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) if do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" + negative_prompt = negative_prompt if negative_prompt is not None else DEFAULT_NEGATIVE_PROMPT negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt if prompt is not None and type(prompt) is not type(negative_prompt): diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py index cd5a734cc3..df507c3a4f 100644 --- a/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py +++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py @@ -50,6 +50,14 @@ else: logger = logging.get_logger(__name__) # pylint: disable=invalid-name +DEFAULT_NEGATIVE_PROMPT = ( + "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, " + "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, " + "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, " + "jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, " + "fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. " + "Overall, the video is of poor quality." 
+) EXAMPLE_DOC_STRING = """ Examples: @@ -331,7 +339,7 @@ class CosmosVideoToWorldPipeline(DiffusionPipeline): prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) if do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" + negative_prompt = negative_prompt if negative_prompt is not None else DEFAULT_NEGATIVE_PROMPT negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt if prompt is not None and type(prompt) is not type(negative_prompt): From 22ac6fae244e713f90ecf138ba7b1c219a5441d0 Mon Sep 17 00:00:00 2001 From: Jared Wen Date: Wed, 28 Jan 2026 06:22:02 +0800 Subject: [PATCH 05/10] [GLM-Image] Add batch support for GlmImagePipeline (#13007) * init Signed-off-by: JaredforReal * change from right padding to left padding Signed-off-by: JaredforReal * try i2i batch Signed-off-by: JaredforReal * fix: revert i2i prior_token_image_ids to original 1D tensor format * refactor KVCache for per prompt batching Signed-off-by: JaredforReal * fix KVCache Signed-off-by: JaredforReal * fix shape error Signed-off-by: JaredforReal * refactor pipeline Signed-off-by: JaredforReal * fix for left padding Signed-off-by: JaredforReal * insert seed to AR model Signed-off-by: JaredforReal * delete generator, use torch manual_seed Signed-off-by: JaredforReal * add batch processing unit tests for GlmImagePipeline Signed-off-by: JaredforReal * simplify normalize images method Signed-off-by: JaredforReal * fix grids_per_sample Signed-off-by: JaredforReal * fix t2i Signed-off-by: JaredforReal * delete comments, simplify condition statement Signed-off-by: JaredforReal * chage generate_prior_tokens outputs Signed-off-by: JaredforReal * simplify if logic Signed-off-by: JaredforReal * support user provided prior_token_ids directly Signed-off-by: JaredforReal * remove blank lines Signed-off-by: JaredforReal * align with transformers Signed-off-by: JaredforReal * Apply style fixes --------- Signed-off-by: JaredforReal Co-authored-by: github-actions[bot] --- .../transformers/transformer_glm_image.py | 91 +++- .../pipelines/glm_image/pipeline_glm_image.py | 460 +++++++++++++----- tests/pipelines/glm_image/test_glm_image.py | 111 ++++- 3 files changed, 514 insertions(+), 148 deletions(-) diff --git a/src/diffusers/models/transformers/transformer_glm_image.py b/src/diffusers/models/transformers/transformer_glm_image.py index b7b3aa391c..6f7ed2fca1 100644 --- a/src/diffusers/models/transformers/transformer_glm_image.py +++ b/src/diffusers/models/transformers/transformer_glm_image.py @@ -143,41 +143,86 @@ class GlmImageAdaLayerNormZero(nn.Module): class GlmImageLayerKVCache: - """KV cache for GlmImage model.""" + """KV cache for GlmImage model. + Supports per-sample caching for batch processing where each sample may have different condition images. 
+ """ def __init__(self): - self.k_cache = None - self.v_cache = None + self.k_caches: List[Optional[torch.Tensor]] = [] + self.v_caches: List[Optional[torch.Tensor]] = [] self.mode: Optional[str] = None # "write", "read", "skip" + self.current_sample_idx: int = 0 # Current sample index for writing def store(self, k: torch.Tensor, v: torch.Tensor): - if self.k_cache is None: - self.k_cache = k - self.v_cache = v + """Store KV cache for the current sample.""" + # k, v shape: (1, seq_len, num_heads, head_dim) + if len(self.k_caches) <= self.current_sample_idx: + # First time storing for this sample + self.k_caches.append(k) + self.v_caches.append(v) else: - self.k_cache = torch.cat([self.k_cache, k], dim=1) - self.v_cache = torch.cat([self.v_cache, v], dim=1) + # Append to existing cache for this sample (multiple condition images) + self.k_caches[self.current_sample_idx] = torch.cat([self.k_caches[self.current_sample_idx], k], dim=1) + self.v_caches[self.current_sample_idx] = torch.cat([self.v_caches[self.current_sample_idx], v], dim=1) def get(self, k: torch.Tensor, v: torch.Tensor): - if self.k_cache.shape[0] != k.shape[0]: - k_cache_expanded = self.k_cache.expand(k.shape[0], -1, -1, -1) - v_cache_expanded = self.v_cache.expand(v.shape[0], -1, -1, -1) - else: - k_cache_expanded = self.k_cache - v_cache_expanded = self.v_cache + """Get combined KV cache for all samples in the batch. - k_cache = torch.cat([k_cache_expanded, k], dim=1) - v_cache = torch.cat([v_cache_expanded, v], dim=1) - return k_cache, v_cache + Args: + k: Current key tensor, shape (batch_size, seq_len, num_heads, head_dim) + v: Current value tensor, shape (batch_size, seq_len, num_heads, head_dim) + Returns: + Combined key and value tensors with cached values prepended. + """ + batch_size = k.shape[0] + num_cached_samples = len(self.k_caches) + if num_cached_samples == 0: + return k, v + if num_cached_samples == 1: + # Single cache, expand for all batch samples (shared condition images) + k_cache_expanded = self.k_caches[0].expand(batch_size, -1, -1, -1) + v_cache_expanded = self.v_caches[0].expand(batch_size, -1, -1, -1) + elif num_cached_samples == batch_size: + # Per-sample cache, concatenate along batch dimension + k_cache_expanded = torch.cat(self.k_caches, dim=0) + v_cache_expanded = torch.cat(self.v_caches, dim=0) + else: + # Mismatch: try to handle by repeating the caches + # This handles cases like num_images_per_prompt > 1 + repeat_factor = batch_size // num_cached_samples + if batch_size % num_cached_samples == 0: + k_cache_list = [] + v_cache_list = [] + for i in range(num_cached_samples): + k_cache_list.append(self.k_caches[i].expand(repeat_factor, -1, -1, -1)) + v_cache_list.append(self.v_caches[i].expand(repeat_factor, -1, -1, -1)) + k_cache_expanded = torch.cat(k_cache_list, dim=0) + v_cache_expanded = torch.cat(v_cache_list, dim=0) + else: + raise ValueError( + f"Cannot match {num_cached_samples} cached samples to batch size {batch_size}. " + f"Batch size must be a multiple of the number of cached samples." 
+ ) + + k_combined = torch.cat([k_cache_expanded, k], dim=1) + v_combined = torch.cat([v_cache_expanded, v], dim=1) + return k_combined, v_combined def clear(self): - self.k_cache = None - self.v_cache = None + self.k_caches = [] + self.v_caches = [] self.mode = None + self.current_sample_idx = 0 + + def next_sample(self): + """Move to the next sample for writing.""" + self.current_sample_idx += 1 class GlmImageKVCache: - """Container for all layers' KV caches.""" + """Container for all layers' KV caches. + Supports per-sample caching for batch processing where each sample may have different condition images. + """ def __init__(self, num_layers: int): self.num_layers = num_layers @@ -192,6 +237,12 @@ class GlmImageKVCache: for cache in self.caches: cache.mode = mode + def next_sample(self): + """Move to the next sample for writing. Call this after processing + all condition images for one batch sample.""" + for cache in self.caches: + cache.next_sample() + def clear(self): for cache in self.caches: cache.clear() diff --git a/src/diffusers/pipelines/glm_image/pipeline_glm_image.py b/src/diffusers/pipelines/glm_image/pipeline_glm_image.py index 5499b8769f..8cbaf99a22 100644 --- a/src/diffusers/pipelines/glm_image/pipeline_glm_image.py +++ b/src/diffusers/pipelines/glm_image/pipeline_glm_image.py @@ -260,25 +260,115 @@ class GlmImagePipeline(DiffusionPipeline): token_ids = token_ids.view(1, -1) return token_ids + @staticmethod + def _validate_and_normalize_images( + image: Optional[Union[List[PIL.Image.Image], List[List[PIL.Image.Image]]]], + batch_size: int, + ) -> Optional[List[List[PIL.Image.Image]]]: + """ + Validate and normalize image inputs to List[List[PIL.Image]]. + + Rules: + - batch_size > 1: Only accepts List[List[PIL.Image]], each sublist must have equal length + - batch_size == 1: Accepts List[PIL.Image] for legacy compatibility (converted to [[img1, img2, ...]]) + - Other formats raise ValueError + + Args: + image: Input images in various formats + batch_size: Number of prompts in the batch + + Returns: + Normalized images as List[List[PIL.Image]], or None if no images provided + """ + if image is None or len(image) == 0: + return None + + first_element = image[0] + + if batch_size == 1: + # Legacy format: List[PIL.Image] -> [[img1, img2, ...]] + if not isinstance(first_element, (list, tuple)): + return [list(image)] + # Already in List[List[PIL.Image]] format + if len(image) != 1: + raise ValueError( + f"For batch_size=1 with List[List[PIL.Image]] format, expected 1 image list, got {len(image)}." + ) + return [list(image[0])] + + # batch_size > 1: must be List[List[PIL.Image]] + if not isinstance(first_element, (list, tuple)): + raise ValueError( + f"For batch_size > 1, images must be List[List[PIL.Image]] format. " + f"Got List[{type(first_element).__name__}] instead. " + f"Each prompt requires its own list of condition images." + ) + + if len(image) != batch_size: + raise ValueError(f"Number of image lists ({len(image)}) must match batch size ({batch_size}).") + + # Validate homogeneous: all sublists must have same length + num_input_images_per_prompt = len(image[0]) + for idx, imgs in enumerate(image): + if len(imgs) != num_input_images_per_prompt: + raise ValueError( + f"All prompts must have the same number of condition images. " + f"Prompt 0 has {num_input_images_per_prompt} images, but prompt {idx} has {len(imgs)} images." 
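# Accepted inputs for the normalizer above (img_* are PIL images; cases are illustrative):
#   batch_size == 1:  [img_a, img_b]             -> [[img_a, img_b]]      (legacy flat list)
#   batch_size == 1:  [[img_a, img_b]]           -> [[img_a, img_b]]
#   batch_size == 2:  [[img_a], [img_b]]         -> unchanged (one condition image per prompt)
#   batch_size == 2:  [img_a, img_b]             -> ValueError (must be a list of lists)
#   batch_size == 2:  [[img_a], [img_b, img_c]]  -> ValueError (sublists must have equal length)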
+ ) + + return [list(imgs) for imgs in image] + def generate_prior_tokens( self, - prompt: str, + prompt: Union[str, List[str]], height: int, width: int, - image: Optional[List[PIL.Image.Image]] = None, + image: Optional[List[List[PIL.Image.Image]]] = None, device: Optional[torch.device] = None, + generator: Optional[torch.Generator] = None, ): + """ + Generate prior tokens for the DiT model using the AR model. + + Args: + prompt: Single prompt or list of prompts + height: Target image height + width: Target image width + image: Normalized image input as List[List[PIL.Image]]. Should be pre-validated + using _validate_and_normalize_images() before calling this method. + device: Target device + generator: Random generator for reproducibility + + Returns: + Tuple of: + - prior_token_ids: Tensor of shape (batch_size, num_tokens) with upsampled prior tokens + - prior_token_image_ids_per_sample: List of tensors, one per sample. Each tensor contains + the upsampled prior token ids for all condition images in that sample. None for t2i. + - source_image_grid_thw_per_sample: List of tensors, one per sample. Each tensor has shape + (num_condition_images, 3) with upsampled grid info. None for t2i. + """ device = device or self._execution_device - is_text_to_image = image is None or len(image) == 0 - content = [] - if image is not None: - for img in image: - content.append({"type": "image", "image": img}) - content.append({"type": "text", "text": prompt}) - messages = [{"role": "user", "content": content}] + + # Normalize prompt to list format + prompt_list = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt_list) + + # Image is already normalized by _validate_and_normalize_images(): None or List[List[PIL.Image]] + is_text_to_image = image is None + # Build messages for each sample in the batch + all_messages = [] + for idx, p in enumerate(prompt_list): + content = [] + if not is_text_to_image: + for img in image[idx]: + content.append({"type": "image", "image": img}) + content.append({"type": "text", "text": p}) + all_messages.append([{"role": "user", "content": content}]) + # Process with the processor (supports batch with left padding) inputs = self.processor.apply_chat_template( - messages, + all_messages, tokenize=True, + padding=True if batch_size > 1 else False, target_h=height, target_w=width, return_dict=True, @@ -286,44 +376,117 @@ class GlmImagePipeline(DiffusionPipeline): ).to(device) image_grid_thw = inputs.get("image_grid_thw") + images_per_sample = inputs.get("images_per_sample") + + # Determine number of condition images and grids per sample + num_condition_images = 0 if is_text_to_image else len(image[0]) + if images_per_sample is not None: + num_grids_per_sample = images_per_sample[0].item() + else: + # Fallback for batch_size=1: total grids is for single sample + num_grids_per_sample = image_grid_thw.shape[0] + + # Compute generation params (same for all samples in homogeneous batch) + first_sample_grids = image_grid_thw[:num_grids_per_sample] max_new_tokens, large_image_offset, token_h, token_w = self._compute_generation_params( - image_grid_thw=image_grid_thw, is_text_to_image=is_text_to_image + image_grid_thw=first_sample_grids, is_text_to_image=is_text_to_image ) + # Generate source image tokens (prior_token_image_ids) for i2i mode prior_token_image_ids = None - if image is not None: - prior_token_image_embed = self.vision_language_encoder.get_image_features( - inputs["pixel_values"], image_grid_thw[:-1] - ) - prior_token_image_embed = 
torch.cat(prior_token_image_embed, dim=0) - prior_token_image_ids = self.vision_language_encoder.get_image_tokens( - prior_token_image_embed, image_grid_thw[:-1] - ) + source_image_grid_thw = None + if not is_text_to_image: + # Extract source grids by selecting condition image indices (skip target grids) + # Grid order from processor: [s0_cond1, s0_cond2, ..., s0_target, s1_cond1, s1_cond2, ..., s1_target, ...] + # We need indices: [0, 1, ..., num_condition_images-1, num_grids_per_sample, num_grids_per_sample+1, ...] + source_indices = [] + for sample_idx in range(batch_size): + base = sample_idx * num_grids_per_sample + source_indices.extend(range(base, base + num_condition_images)) + source_grids = image_grid_thw[source_indices] - # For GLM-Image, greedy decoding is not allowed; it may cause repetitive outputs. - # max_new_tokens must be exactly grid_h * grid_w + 1 (the +1 is for EOS). + if len(source_grids) > 0: + prior_token_image_embed = self.vision_language_encoder.get_image_features( + inputs["pixel_values"], source_grids, return_dict=False + ) + prior_token_image_embed = torch.cat(prior_token_image_embed, dim=0) + prior_token_image_ids_d32 = self.vision_language_encoder.get_image_tokens( + prior_token_image_embed, source_grids + ) + # Upsample each source image's prior tokens to match VAE/DiT resolution + split_sizes = source_grids.prod(dim=-1).tolist() + prior_ids_per_source = torch.split(prior_token_image_ids_d32, split_sizes) + upsampled_prior_ids = [] + for i, prior_ids in enumerate(prior_ids_per_source): + t, h, w = source_grids[i].tolist() + upsampled = self._upsample_token_ids(prior_ids, int(h), int(w)) + upsampled_prior_ids.append(upsampled.squeeze(0)) + prior_token_image_ids = torch.cat(upsampled_prior_ids, dim=0) + # Upsample grid dimensions for later splitting + upsampled_grids = source_grids.clone() + upsampled_grids[:, 1] = upsampled_grids[:, 1] * 2 + upsampled_grids[:, 2] = upsampled_grids[:, 2] * 2 + source_image_grid_thw = upsampled_grids + + # Generate with AR model + # Set torch random seed from generator for reproducibility + # (transformers generate() doesn't accept generator parameter) + if generator is not None: + seed = generator.initial_seed() + torch.manual_seed(seed) + if device is not None and device.type == "cuda": + torch.cuda.manual_seed(seed) outputs = self.vision_language_encoder.generate( **inputs, max_new_tokens=max_new_tokens, do_sample=True, ) - prior_token_ids_d32 = self._extract_large_image_tokens( - outputs, inputs["input_ids"].shape[-1], large_image_offset, token_h * token_w - ) - prior_token_ids = self._upsample_token_ids(prior_token_ids_d32, token_h, token_w) + # Extract and upsample prior tokens for each sample + # For left-padded inputs, generated tokens start after the padded input sequence + all_prior_token_ids = [] + max_input_length = inputs["input_ids"].shape[-1] + for idx in range(batch_size): + # For left-padded sequences, generated tokens start at max_input_length + # (padding is on the left, so all sequences end at the same position) + prior_token_ids_d32 = self._extract_large_image_tokens( + outputs[idx : idx + 1], max_input_length, large_image_offset, token_h * token_w + ) + prior_token_ids = self._upsample_token_ids(prior_token_ids_d32, token_h, token_w) + all_prior_token_ids.append(prior_token_ids) + prior_token_ids = torch.cat(all_prior_token_ids, dim=0) - return prior_token_ids, prior_token_image_ids + # Split prior_token_image_ids and source_image_grid_thw into per-sample lists for easier consumption + 
prior_token_image_ids_per_sample = None + source_image_grid_thw_per_sample = None + if prior_token_image_ids is not None and source_image_grid_thw is not None: + # Split grids: each sample has num_condition_images grids + source_image_grid_thw_per_sample = list(torch.split(source_image_grid_thw, num_condition_images)) + # Split prior_token_image_ids: tokens per sample may vary due to different image sizes + tokens_per_image = source_image_grid_thw.prod(dim=-1).tolist() + tokens_per_sample = [] + for i in range(batch_size): + start_idx = i * num_condition_images + end_idx = start_idx + num_condition_images + tokens_per_sample.append(sum(tokens_per_image[start_idx:end_idx])) + prior_token_image_ids_per_sample = list(torch.split(prior_token_image_ids, tokens_per_sample)) + + return prior_token_ids, prior_token_image_ids_per_sample, source_image_grid_thw_per_sample def get_glyph_texts(self, prompt): - prompt = prompt[0] if isinstance(prompt, list) else prompt - ocr_texts = ( - re.findall(r"'([^']*)'", prompt) - + re.findall(r"“([^“”]*)”", prompt) - + re.findall(r'"([^"]*)"', prompt) - + re.findall(r"「([^「」]*)」", prompt) - ) - return ocr_texts + """Extract glyph texts from prompt(s). Returns a list of lists for batch processing.""" + if isinstance(prompt, str): + prompt = [prompt] + all_ocr_texts = [] + for p in prompt: + ocr_texts = ( + re.findall(r"'([^']*)'", p) + + re.findall(r"\u201c([^\u201c\u201d]*)\u201d", p) + + re.findall(r'"([^"]*)"', p) + + re.findall(r"「([^「」]*)」", p) + ) + all_ocr_texts.append(ocr_texts) + return all_ocr_texts def _get_glyph_embeds( self, @@ -332,29 +495,51 @@ class GlmImagePipeline(DiffusionPipeline): device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, ): + """Get glyph embeddings for each prompt in the batch.""" device = device or self._execution_device dtype = dtype or self.text_encoder.dtype - glyph_texts = self.get_glyph_texts(prompt) - input_ids = self.tokenizer( - glyph_texts if len(glyph_texts) > 0 else [""], - max_length=max_sequence_length, - truncation=True, - ).input_ids - input_ids = [ - [self.tokenizer.pad_token_id] * ((len(input_ids) + 1) % 2) + input_ids_ for input_ids_ in input_ids - ] - max_length = max(len(input_ids_) for input_ids_ in input_ids) - attention_mask = torch.tensor( - [[1] * len(input_ids_) + [0] * (max_length - len(input_ids_)) for input_ids_ in input_ids], device=device - ) - input_ids = torch.tensor( - [input_ids_ + [self.tokenizer.pad_token_id] * (max_length - len(input_ids_)) for input_ids_ in input_ids], - device=device, - ) - outputs = self.text_encoder(input_ids, attention_mask=attention_mask) - glyph_embeds = outputs.last_hidden_state[attention_mask.bool()].unsqueeze(0) + # get_glyph_texts now returns a list of lists (one per prompt) + all_glyph_texts = self.get_glyph_texts(prompt) + all_glyph_embeds = [] + for glyph_texts in all_glyph_texts: + if len(glyph_texts) == 0: + glyph_texts = [""] + input_ids = self.tokenizer( + glyph_texts, + max_length=max_sequence_length, + truncation=True, + ).input_ids + input_ids = [ + [self.tokenizer.pad_token_id] * ((len(input_ids) + 1) % 2) + input_ids_ for input_ids_ in input_ids + ] + max_length = max(len(input_ids_) for input_ids_ in input_ids) + attention_mask = torch.tensor( + [[1] * len(input_ids_) + [0] * (max_length - len(input_ids_)) for input_ids_ in input_ids], + device=device, + ) + input_ids = torch.tensor( + [ + input_ids_ + [self.tokenizer.pad_token_id] * (max_length - len(input_ids_)) + for input_ids_ in input_ids + ], + device=device, + ) + outputs 
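# Example of the per-prompt glyph extraction above (prompts are illustrative):
#   get_glyph_texts(['A sign that says "OPEN"', 'A plain mug'])  ->  [['OPEN'], []]
# Prompts with no quoted text fall back to a single empty string when the glyph
# embeddings are computed, so every sample still produces an embedding row.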
= self.text_encoder(input_ids, attention_mask=attention_mask) + glyph_embeds = outputs.last_hidden_state[attention_mask.bool()].unsqueeze(0) + all_glyph_embeds.append(glyph_embeds) + + # Pad to same sequence length and stack (use left padding to match transformers) + max_seq_len = max(emb.size(1) for emb in all_glyph_embeds) + padded_embeds = [] + for emb in all_glyph_embeds: + if emb.size(1) < max_seq_len: + pad = torch.zeros(emb.size(0), max_seq_len - emb.size(1), emb.size(2), device=device, dtype=emb.dtype) + emb = torch.cat([pad, emb], dim=1) # left padding + padded_embeds.append(emb) + + glyph_embeds = torch.cat(padded_embeds, dim=0) return glyph_embeds.to(device=device, dtype=dtype) def encode_prompt( @@ -399,9 +584,9 @@ class GlmImagePipeline(DiffusionPipeline): if prompt_embeds is None: prompt_embeds = self._get_glyph_embeds(prompt, max_sequence_length, device, dtype) - seq_len = prompt_embeds.size(1) - prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) - prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + # Repeat embeddings for num_images_per_prompt + if num_images_per_prompt > 1: + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) # For GLM-Image, negative_prompt must be "" instead of None if do_classifier_free_guidance and negative_prompt_embeds is None: @@ -409,9 +594,8 @@ class GlmImagePipeline(DiffusionPipeline): negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt negative_prompt_embeds = self._get_glyph_embeds(negative_prompt, max_sequence_length, device, dtype) - seq_len = negative_prompt_embeds.size(1) - negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) - negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + if num_images_per_prompt > 1: + negative_prompt_embeds = negative_prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) return prompt_embeds, negative_prompt_embeds @@ -442,7 +626,9 @@ class GlmImagePipeline(DiffusionPipeline): prompt_embeds=None, negative_prompt_embeds=None, prior_token_ids=None, - prior_image_token_ids=None, + prior_token_image_ids=None, + source_image_grid_thw=None, + image=None, ): if ( height is not None @@ -488,12 +674,24 @@ class GlmImagePipeline(DiffusionPipeline): f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" f" {negative_prompt_embeds.shape}." ) - if (prior_token_ids is None and prior_image_token_ids is not None) or ( - prior_token_ids is not None and prior_image_token_ids is None - ): + # Validate prior token inputs: for i2i mode, all three must be provided together + # For t2i mode, only prior_token_ids is needed (prior_token_image_ids and source_image_grid_thw should be None) + prior_image_inputs = [prior_token_image_ids, source_image_grid_thw] + num_prior_image_inputs = sum(x is not None for x in prior_image_inputs) + if num_prior_image_inputs > 0 and num_prior_image_inputs < len(prior_image_inputs): raise ValueError( - f"Cannot forward only one `prior_token_ids`: {prior_token_ids} or `prior_image_token_ids`:" - f" {prior_image_token_ids} provided. Please make sure both are provided or neither." + "`prior_token_image_ids` and `source_image_grid_thw` must be provided together for i2i mode. " + f"Got prior_token_image_ids={prior_token_image_ids is not None}, " + f"source_image_grid_thw={source_image_grid_thw is not None}." 
+ ) + if num_prior_image_inputs > 0 and prior_token_ids is None: + raise ValueError( + "`prior_token_ids` must be provided when `prior_token_image_ids` and `source_image_grid_thw` are provided." + ) + if num_prior_image_inputs > 0 and image is None: + raise ValueError( + "`image` must be provided when `prior_token_image_ids` and `source_image_grid_thw` are provided " + "for i2i mode, as the images are needed for VAE encoding to build the KV cache." ) if prior_token_ids is not None and prompt_embeds is None: @@ -545,7 +743,8 @@ class GlmImagePipeline(DiffusionPipeline): prompt_embeds: Optional[torch.Tensor] = None, negative_prompt_embeds: Optional[torch.Tensor] = None, prior_token_ids: Optional[torch.FloatTensor] = None, - prior_image_token_ids: Optional[torch.Tensor] = None, + prior_token_image_ids: Optional[List[torch.Tensor]] = None, + source_image_grid_thw: Optional[List[torch.Tensor]] = None, crops_coords_top_left: Tuple[int, int] = (0, 0), output_type: str = "pil", return_dict: bool = True, @@ -598,7 +797,9 @@ class GlmImagePipeline(DiffusionPipeline): prompt_embeds, negative_prompt_embeds, prior_token_ids, - prior_image_token_ids, + prior_token_image_ids, + source_image_grid_thw, + image, ) self._guidance_scale = guidance_scale self._attention_kwargs = attention_kwargs @@ -611,34 +812,47 @@ class GlmImagePipeline(DiffusionPipeline): batch_size = len(prompt) else: batch_size = prompt_embeds.shape[0] - if batch_size != 1: - raise ValueError(f"batch_size must be 1 due to AR model limitations, got {batch_size}") device = self._execution_device - # 2. Preprocess image tokens and prompt tokens - if prior_token_ids is None: - prior_token_ids, prior_token_image_ids = self.generate_prior_tokens( - prompt=prompt[0] if isinstance(prompt, list) else prompt, - image=image, - height=height, - width=width, - device=device, - ) + # 2. Validate and normalize image format + normalized_image = self._validate_and_normalize_images(image, batch_size) - # 3. Preprocess image - if image is not None: - preprocessed_condition_images = [] - for img in image: - image_height, image_width = img.size[::-1] if isinstance(img, PIL.Image.Image) else img.shape[:2] - multiple_of = self.vae_scale_factor * self.transformer.config.patch_size - image_height = (image_height // multiple_of) * multiple_of - image_width = (image_width // multiple_of) * multiple_of - img = self.image_processor.preprocess(img, height=image_height, width=image_width) - preprocessed_condition_images.append(img) - height = height or image_height - width = width or image_width - image = preprocessed_condition_images + # 3. Generate prior tokens (batch mode) + # Get a single generator for AR model (use first if list provided) + ar_generator = generator[0] if isinstance(generator, list) else generator + if prior_token_ids is None: + prior_token_ids, prior_token_image_ids_per_sample, source_image_grid_thw_per_sample = ( + self.generate_prior_tokens( + prompt=prompt, + image=normalized_image, + height=height, + width=width, + device=device, + generator=ar_generator, + ) + ) + else: + # User provided prior_token_ids directly (from generate_prior_tokens) + prior_token_image_ids_per_sample = prior_token_image_ids + source_image_grid_thw_per_sample = source_image_grid_thw + + # 4. 
Preprocess images for VAE encoding + preprocessed_images = None + if normalized_image is not None: + preprocessed_images = [] + for prompt_images in normalized_image: + prompt_preprocessed = [] + for img in prompt_images: + image_height, image_width = img.size[::-1] if isinstance(img, PIL.Image.Image) else img.shape[:2] + multiple_of = self.vae_scale_factor * self.transformer.config.patch_size + image_height = (image_height // multiple_of) * multiple_of + image_width = (image_width // multiple_of) * multiple_of + img = self.image_processor.preprocess(img, height=image_height, width=image_width) + prompt_preprocessed.append(img) + height = height or image_height + width = width or image_width + preprocessed_images.append(prompt_preprocessed) # 5. Encode input prompt prompt_embeds, negative_prompt_embeds = self.encode_prompt( @@ -652,7 +866,7 @@ class GlmImagePipeline(DiffusionPipeline): dtype=self.dtype, ) - # 4. Prepare latents and (optional) image kv cache + # 6. Prepare latents and (optional) image kv cache latent_channels = self.transformer.config.in_channels latents = self.prepare_latents( batch_size=batch_size * num_images_per_prompt, @@ -666,7 +880,7 @@ class GlmImagePipeline(DiffusionPipeline): ) kv_caches = GlmImageKVCache(num_layers=self.transformer.config.num_layers) - if image is not None: + if normalized_image is not None: kv_caches.set_mode("write") latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, self.vae.config.latent_channels, 1, 1) latents_std = torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.latent_channels, 1, 1) @@ -674,29 +888,38 @@ class GlmImagePipeline(DiffusionPipeline): latents_mean = latents_mean.to(device=device, dtype=prompt_embeds.dtype) latents_std = latents_std.to(device=device, dtype=prompt_embeds.dtype) - for condition_image, condition_image_prior_token_id in zip(image, prior_token_image_ids): - condition_image = condition_image.to(device=device, dtype=prompt_embeds.dtype) - condition_latent = retrieve_latents( - self.vae.encode(condition_image), generator=generator, sample_mode="argmax" - ) - condition_latent = (condition_latent - latents_mean) / latents_std + # Process each sample's condition images + for prompt_idx in range(batch_size): + prompt_images = preprocessed_images[prompt_idx] + prompt_prior_ids = prior_token_image_ids_per_sample[prompt_idx] + prompt_grid_thw = source_image_grid_thw_per_sample[prompt_idx] - # Do not remove. - # It would be use to run the reference image through a - # forward pass at timestep 0 and keep the KV cache. 
- _ = self.transformer( - hidden_states=condition_latent, - encoder_hidden_states=torch.zeros_like(prompt_embeds)[:1, :0, ...], - prior_token_id=condition_image_prior_token_id, - prior_token_drop=torch.full_like(condition_image_prior_token_id, False, dtype=torch.bool), - timestep=torch.zeros((1,), device=device), - target_size=torch.tensor([condition_image.shape[-2:]], device=device), - crop_coords=torch.zeros((1, 2), device=device), - attention_kwargs=attention_kwargs, - kv_caches=kv_caches, - ) + # Split this sample's prior_token_image_ids by each image's token count + split_sizes = prompt_grid_thw.prod(dim=-1).tolist() + prior_ids_per_image = torch.split(prompt_prior_ids, split_sizes) + # Process each condition image for this sample + for condition_image, condition_image_prior_token_id in zip(prompt_images, prior_ids_per_image): + condition_image = condition_image.to(device=device, dtype=prompt_embeds.dtype) + condition_latent = retrieve_latents( + self.vae.encode(condition_image), generator=generator, sample_mode="argmax" + ) + condition_latent = (condition_latent - latents_mean) / latents_std - # 6. Prepare additional timestep conditions + _ = self.transformer( + hidden_states=condition_latent, + encoder_hidden_states=torch.zeros_like(prompt_embeds)[:1, :0, ...], + prior_token_id=condition_image_prior_token_id, + prior_token_drop=torch.full_like(condition_image_prior_token_id, False, dtype=torch.bool), + timestep=torch.zeros((1,), device=device), + target_size=torch.tensor([condition_image.shape[-2:]], device=device), + crop_coords=torch.zeros((1, 2), device=device), + attention_kwargs=attention_kwargs, + kv_caches=kv_caches, + ) + # Move to next sample's cache slot + kv_caches.next_sample() + + # 7. Prepare additional timestep conditions target_size = (height, width) target_size = torch.tensor([target_size], dtype=prompt_embeds.dtype, device=device) crops_coords_top_left = torch.tensor([crops_coords_top_left], dtype=prompt_embeds.dtype, device=device) @@ -726,10 +949,13 @@ class GlmImagePipeline(DiffusionPipeline): ) self._num_timesteps = len(timesteps) - # 7. Denoising loop + # 8. 
Denoising loop transformer_dtype = self.transformer.dtype num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + # Repeat prior_token_ids for num_images_per_prompt + if num_images_per_prompt > 1: + prior_token_ids = prior_token_ids.repeat_interleave(num_images_per_prompt, dim=0) prior_token_drop_cond = torch.full_like(prior_token_ids, False, dtype=torch.bool) prior_token_drop_uncond = torch.full_like(prior_token_ids, True, dtype=torch.bool) with self.progress_bar(total=num_inference_steps) as progress_bar: @@ -742,7 +968,7 @@ class GlmImagePipeline(DiffusionPipeline): timestep = t.expand(latents.shape[0]) - 1 - if image is not None: + if prior_token_image_ids_per_sample is not None: kv_caches.set_mode("read") noise_pred_cond = self.transformer( @@ -760,7 +986,7 @@ class GlmImagePipeline(DiffusionPipeline): # perform guidance if self.do_classifier_free_guidance: - if image is not None: + if prior_token_image_ids_per_sample is not None: kv_caches.set_mode("skip") noise_pred_uncond = self.transformer( hidden_states=latent_model_input, diff --git a/tests/pipelines/glm_image/test_glm_image.py b/tests/pipelines/glm_image/test_glm_image.py index 7a380b99b0..d907d082d2 100644 --- a/tests/pipelines/glm_image/test_glm_image.py +++ b/tests/pipelines/glm_image/test_glm_image.py @@ -169,7 +169,7 @@ class GlmImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase): # fmt: off expected_slice = np.array( [ - 0.5796329, 0.5005878, 0.45881274, 0.45331675, 0.43688118, 0.4899527, 0.54017603, 0.50983673, 0.3387968, 0.38074082, 0.29942477, 0.33733928, 0.3672544, 0.38462338, 0.40991822, 0.46641728 + 0.5849247, 0.50278825, 0.45747858, 0.45895284, 0.43804976, 0.47044256, 0.5239665, 0.47904694, 0.3323419, 0.38725388, 0.28505728, 0.3161863, 0.35026982, 0.37546024, 0.4090118, 0.46629113 ] ) # fmt: on @@ -177,20 +177,109 @@ class GlmImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase): self.assertEqual(image.shape, (3, 32, 32)) self.assertTrue(np.allclose(expected_slice, generated_slice, atol=1e-4, rtol=1e-4)) - @unittest.skip("Not supported.") def test_inference_batch_single_identical(self): - # GLM-Image has batch_size=1 constraint due to AR model - pass + """Test that batch=1 produces consistent results with the same seed.""" + device = "cpu" + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(device) + pipe.set_progress_bar_config(disable=None) - @unittest.skip("Not supported.") - def test_inference_batch_consistent(self): - # GLM-Image has batch_size=1 constraint due to AR model - pass + # Run twice with same seed + inputs1 = self.get_dummy_inputs(device, seed=42) + inputs2 = self.get_dummy_inputs(device, seed=42) + + image1 = pipe(**inputs1).images[0] + image2 = pipe(**inputs2).images[0] + + self.assertTrue(torch.allclose(image1, image2, atol=1e-4)) + + def test_inference_batch_multiple_prompts(self): + """Test batch processing with multiple prompts.""" + device = "cpu" + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=device).manual_seed(42) + height, width = 32, 32 + + inputs = { + "prompt": ["A photo of a cat", "A photo of a dog"], + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 1.5, + "height": height, + "width": width, + "max_sequence_length": 16, + "output_type": "pt", + } + + images = pipe(**inputs).images + + # Should return 2 images + 
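# A minimal end-to-end sketch of the batched call pattern exercised by these tests; the
# checkpoint path is a placeholder and dtype/device handling is simplified.
import torch

from diffusers import GlmImagePipeline

pipe = GlmImagePipeline.from_pretrained("path/to/glm-image-checkpoint", torch_dtype=torch.bfloat16)
pipe.to("cuda")

images = pipe(
    prompt=["A photo of a cat", "A photo of a dog"],  # one AR prior is generated per prompt
    num_images_per_prompt=2,
    num_inference_steps=50,
    guidance_scale=1.5,
    height=1024,
    width=1024,
).images
# len(images) == 4: 2 prompts x 2 images per prompt. For image-to-image with a batch,
# pass image=[[cond_img_for_cat], [cond_img_for_dog]] so each prompt has its own
# equally sized list of condition images.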
self.assertEqual(len(images), 2) + self.assertEqual(images[0].shape, (3, 32, 32)) + self.assertEqual(images[1].shape, (3, 32, 32)) - @unittest.skip("Not supported.") def test_num_images_per_prompt(self): - # GLM-Image has batch_size=1 constraint due to AR model - pass + """Test generating multiple images per prompt.""" + device = "cpu" + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=device).manual_seed(42) + height, width = 32, 32 + + inputs = { + "prompt": "A photo of a cat", + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 1.5, + "height": height, + "width": width, + "max_sequence_length": 16, + "output_type": "pt", + "num_images_per_prompt": 2, + } + + images = pipe(**inputs).images + + # Should return 2 images for single prompt + self.assertEqual(len(images), 2) + self.assertEqual(images[0].shape, (3, 32, 32)) + self.assertEqual(images[1].shape, (3, 32, 32)) + + def test_batch_with_num_images_per_prompt(self): + """Test batch prompts with num_images_per_prompt > 1.""" + device = "cpu" + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=device).manual_seed(42) + height, width = 32, 32 + + inputs = { + "prompt": ["A photo of a cat", "A photo of a dog"], + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 1.5, + "height": height, + "width": width, + "max_sequence_length": 16, + "output_type": "pt", + "num_images_per_prompt": 2, + } + + images = pipe(**inputs).images + + # Should return 4 images (2 prompts × 2 images per prompt) + self.assertEqual(len(images), 4) @unittest.skip("Needs to be revisited.") def test_encode_prompt_works_in_isolation(self): From d54669a73e62880bec8cf8d937cf1796aed4d609 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Tue, 27 Jan 2026 23:42:48 +0100 Subject: [PATCH 06/10] [Qwen] avoid creating attention masks when there is no padding (#12987) * avoid creating attention masks when there is no padding * make fix-copies * torch compile tests * set all ones mask to none * fix positional encoding from becoming > 4096 * fix from review * slice freqs_cis to match the input sequence length * keep only attenton masking change --------- Co-authored-by: Sayak Paul --- .../pipelines/qwenimage/pipeline_qwenimage.py | 3 + .../pipeline_qwenimage_controlnet.py | 3 + .../pipeline_qwenimage_controlnet_inpaint.py | 3 + .../qwenimage/pipeline_qwenimage_edit.py | 3 + .../pipeline_qwenimage_edit_inpaint.py | 3 + .../qwenimage/pipeline_qwenimage_edit_plus.py | 3 + .../qwenimage/pipeline_qwenimage_img2img.py | 3 + .../qwenimage/pipeline_qwenimage_inpaint.py | 3 + .../qwenimage/pipeline_qwenimage_layered.py | 3 + .../test_models_transformer_qwenimage.py | 71 +++++++++++++++++++ 10 files changed, 98 insertions(+) diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py index bc3ce84e10..21515df608 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py @@ -262,6 +262,9 @@ class QwenImagePipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + if prompt_embeds_mask 
is not None and prompt_embeds_mask.all(): + prompt_embeds_mask = None + return prompt_embeds, prompt_embeds_mask def check_inputs( diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py index ce6fc974a5..714ec04238 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py @@ -324,6 +324,9 @@ class QwenImageControlNetPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + if prompt_embeds_mask is not None and prompt_embeds_mask.all(): + prompt_embeds_mask = None + return prompt_embeds, prompt_embeds_mask def check_inputs( diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet_inpaint.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet_inpaint.py index 77d78a5ca7..f8521318e6 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet_inpaint.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet_inpaint.py @@ -305,6 +305,9 @@ class QwenImageControlNetInpaintPipeline(DiffusionPipeline, QwenImageLoraLoaderM prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + if prompt_embeds_mask is not None and prompt_embeds_mask.all(): + prompt_embeds_mask = None + return prompt_embeds, prompt_embeds_mask def check_inputs( diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py index dd723460a5..353aadcbf0 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit.py @@ -309,6 +309,9 @@ class QwenImageEditPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + if prompt_embeds_mask is not None and prompt_embeds_mask.all(): + prompt_embeds_mask = None + return prompt_embeds, prompt_embeds_mask def check_inputs( diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py index cf467203a9..75be62cf6d 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py @@ -321,6 +321,9 @@ class QwenImageEditInpaintPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + if prompt_embeds_mask is not None and prompt_embeds_mask.all(): + prompt_embeds_mask = None + return prompt_embeds, prompt_embeds_mask # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_inpaint.QwenImageInpaintPipeline.check_inputs diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py index 257e2d846c..bc688aeee3 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py +++ 
b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_plus.py @@ -323,6 +323,9 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + if prompt_embeds_mask is not None and prompt_embeds_mask.all(): + prompt_embeds_mask = None + return prompt_embeds, prompt_embeds_mask # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit.QwenImageEditPipeline.check_inputs diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py index e0b41b8b87..2c9da7545e 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py @@ -305,6 +305,9 @@ class QwenImageImg2ImgPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + if prompt_embeds_mask is not None and prompt_embeds_mask.all(): + prompt_embeds_mask = None + return prompt_embeds, prompt_embeds_mask def check_inputs( diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py index 83f02539b1..536a7984e8 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py @@ -316,6 +316,9 @@ class QwenImageInpaintPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + if prompt_embeds_mask is not None and prompt_embeds_mask.all(): + prompt_embeds_mask = None + return prompt_embeds, prompt_embeds_mask def check_inputs( diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py index 53d2c169ee..0a53a0ac77 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py @@ -328,6 +328,9 @@ the image\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>as prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + if prompt_embeds_mask is not None and prompt_embeds_mask.all(): + prompt_embeds_mask = None + return prompt_embeds, prompt_embeds_mask def get_image_caption(self, prompt_image, use_en_prompt=True, device=None): diff --git a/tests/models/transformers/test_models_transformer_qwenimage.py b/tests/models/transformers/test_models_transformer_qwenimage.py index 384954dfba..e6b19377b1 100644 --- a/tests/models/transformers/test_models_transformer_qwenimage.py +++ b/tests/models/transformers/test_models_transformer_qwenimage.py @@ -276,3 +276,74 @@ class QwenImageTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCas def test_torch_compile_recompilation_and_graph_break(self): super().test_torch_compile_recompilation_and_graph_break() + + def test_torch_compile_with_and_without_mask(self): + """Test that torch.compile works with both None mask and padding mask.""" + init_dict, inputs = 
self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict).to(torch_device) + model.eval() + model.compile(mode="default", fullgraph=True) + + # Test 1: Run with None mask (no padding, all tokens are valid) + inputs_no_mask = inputs.copy() + inputs_no_mask["encoder_hidden_states_mask"] = None + + # First run to allow compilation + with torch.no_grad(): + output_no_mask = model(**inputs_no_mask) + + # Second run to verify no recompilation + with ( + torch._inductor.utils.fresh_inductor_cache(), + torch._dynamo.config.patch(error_on_recompile=True), + torch.no_grad(), + ): + output_no_mask_2 = model(**inputs_no_mask) + + self.assertEqual(output_no_mask.sample.shape[1], inputs["hidden_states"].shape[1]) + self.assertEqual(output_no_mask_2.sample.shape[1], inputs["hidden_states"].shape[1]) + + # Test 2: Run with all-ones mask (should behave like None) + inputs_all_ones = inputs.copy() + # Keep the all-ones mask + self.assertTrue(inputs_all_ones["encoder_hidden_states_mask"].all().item()) + + # First run to allow compilation + with torch.no_grad(): + output_all_ones = model(**inputs_all_ones) + + # Second run to verify no recompilation + with ( + torch._inductor.utils.fresh_inductor_cache(), + torch._dynamo.config.patch(error_on_recompile=True), + torch.no_grad(), + ): + output_all_ones_2 = model(**inputs_all_ones) + + self.assertEqual(output_all_ones.sample.shape[1], inputs["hidden_states"].shape[1]) + self.assertEqual(output_all_ones_2.sample.shape[1], inputs["hidden_states"].shape[1]) + + # Test 3: Run with actual padding mask (has zeros) + inputs_with_padding = inputs.copy() + mask_with_padding = inputs["encoder_hidden_states_mask"].clone() + mask_with_padding[:, 4:] = 0 # Last 3 tokens are padding + + inputs_with_padding["encoder_hidden_states_mask"] = mask_with_padding + + # First run to allow compilation + with torch.no_grad(): + output_with_padding = model(**inputs_with_padding) + + # Second run to verify no recompilation + with ( + torch._inductor.utils.fresh_inductor_cache(), + torch._dynamo.config.patch(error_on_recompile=True), + torch.no_grad(), + ): + output_with_padding_2 = model(**inputs_with_padding) + + self.assertEqual(output_with_padding.sample.shape[1], inputs["hidden_states"].shape[1]) + self.assertEqual(output_with_padding_2.sample.shape[1], inputs["hidden_states"].shape[1]) + + # Verify that outputs are different (mask should affect results) + self.assertFalse(torch.allclose(output_no_mask.sample, output_with_padding.sample, atol=1e-3)) From 53d8a1e3100bdaa30975324e9ff674091684a1c3 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 27 Jan 2026 15:43:14 -1000 Subject: [PATCH 07/10] [modular]support klein (#13002) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * support klein * style * copies * Apply suggestions from code review Co-authored-by: Sayak Paul Co-authored-by: Álvaro Somoza * Update src/diffusers/modular_pipelines/flux2/encoders.py * a few fix: unpack latents before decoder etc * style * remove guidannce to its own block * style * flux2-dev work in modular setting * up * up up * add tests --------- Co-authored-by: yiyi@huggingface.co Co-authored-by: Sayak Paul Co-authored-by: Álvaro Somoza --- src/diffusers/__init__.py | 6 + src/diffusers/modular_pipelines/__init__.py | 11 +- .../modular_pipelines/flux2/__init__.py | 17 +- .../modular_pipelines/flux2/before_denoise.py | 130 +++++-- .../modular_pipelines/flux2/decoders.py | 107 ++++-- .../modular_pipelines/flux2/denoise.py | 261 +++++++++++++- 
.../modular_pipelines/flux2/encoders.py | 322 ++++++++++++++++-- .../modular_pipelines/flux2/inputs.py | 86 ++++- ...ular_blocks.py => modular_blocks_flux2.py} | 72 +++- .../flux2/modular_blocks_flux2_klein.py | 232 +++++++++++++ .../flux2/modular_pipeline.py | 55 +++ .../modular_pipelines/modular_pipeline.py | 1 + .../dummy_torch_and_transformers_objects.py | 45 +++ .../test_modular_pipeline_flux2_klein.py | 91 +++++ .../test_modular_pipeline_flux2_klein_base.py | 91 +++++ 15 files changed, 1412 insertions(+), 115 deletions(-) rename src/diffusers/modular_pipelines/flux2/{modular_blocks.py => modular_blocks_flux2.py} (64%) create mode 100644 src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py create mode 100644 tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py create mode 100644 tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 24b9c12db6..52ec30c536 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -413,6 +413,9 @@ else: _import_structure["modular_pipelines"].extend( [ "Flux2AutoBlocks", + "Flux2KleinAutoBlocks", + "Flux2KleinBaseAutoBlocks", + "Flux2KleinModularPipeline", "Flux2ModularPipeline", "FluxAutoBlocks", "FluxKontextAutoBlocks", @@ -1146,6 +1149,9 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: else: from .modular_pipelines import ( Flux2AutoBlocks, + Flux2KleinAutoBlocks, + Flux2KleinBaseAutoBlocks, + Flux2KleinModularPipeline, Flux2ModularPipeline, FluxAutoBlocks, FluxKontextAutoBlocks, diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index e64db23f38..823a3d263e 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -54,7 +54,10 @@ else: ] _import_structure["flux2"] = [ "Flux2AutoBlocks", + "Flux2KleinAutoBlocks", + "Flux2KleinBaseAutoBlocks", "Flux2ModularPipeline", + "Flux2KleinModularPipeline", ] _import_structure["qwenimage"] = [ "QwenImageAutoBlocks", @@ -81,7 +84,13 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: else: from .components_manager import ComponentsManager from .flux import FluxAutoBlocks, FluxKontextAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline - from .flux2 import Flux2AutoBlocks, Flux2ModularPipeline + from .flux2 import ( + Flux2AutoBlocks, + Flux2KleinAutoBlocks, + Flux2KleinBaseAutoBlocks, + Flux2KleinModularPipeline, + Flux2ModularPipeline, + ) from .modular_pipeline import ( AutoPipelineBlocks, BlockState, diff --git a/src/diffusers/modular_pipelines/flux2/__init__.py b/src/diffusers/modular_pipelines/flux2/__init__.py index 21a41c1fe9..220ec0c4ab 100644 --- a/src/diffusers/modular_pipelines/flux2/__init__.py +++ b/src/diffusers/modular_pipelines/flux2/__init__.py @@ -43,7 +43,7 @@ else: "Flux2ProcessImagesInputStep", "Flux2TextInputStep", ] - _import_structure["modular_blocks"] = [ + _import_structure["modular_blocks_flux2"] = [ "ALL_BLOCKS", "AUTO_BLOCKS", "REMOTE_AUTO_BLOCKS", @@ -51,10 +51,11 @@ else: "IMAGE_CONDITIONED_BLOCKS", "Flux2AutoBlocks", "Flux2AutoVaeEncoderStep", - "Flux2BeforeDenoiseStep", + "Flux2CoreDenoiseStep", "Flux2VaeEncoderSequentialStep", ] - _import_structure["modular_pipeline"] = ["Flux2ModularPipeline"] + _import_structure["modular_blocks_flux2_klein"] = ["Flux2KleinAutoBlocks", "Flux2KleinBaseAutoBlocks"] + _import_structure["modular_pipeline"] = ["Flux2ModularPipeline", "Flux2KleinModularPipeline"] if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: try: @@ -85,7 
+86,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: Flux2ProcessImagesInputStep, Flux2TextInputStep, ) - from .modular_blocks import ( + from .modular_blocks_flux2 import ( ALL_BLOCKS, AUTO_BLOCKS, IMAGE_CONDITIONED_BLOCKS, @@ -93,10 +94,14 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: TEXT2IMAGE_BLOCKS, Flux2AutoBlocks, Flux2AutoVaeEncoderStep, - Flux2BeforeDenoiseStep, + Flux2CoreDenoiseStep, Flux2VaeEncoderSequentialStep, ) - from .modular_pipeline import Flux2ModularPipeline + from .modular_blocks_flux2_klein import ( + Flux2KleinAutoBlocks, + Flux2KleinBaseAutoBlocks, + ) + from .modular_pipeline import Flux2KleinModularPipeline, Flux2ModularPipeline else: import sys diff --git a/src/diffusers/modular_pipelines/flux2/before_denoise.py b/src/diffusers/modular_pipelines/flux2/before_denoise.py index 42624688ad..d5bab16586 100644 --- a/src/diffusers/modular_pipelines/flux2/before_denoise.py +++ b/src/diffusers/modular_pipelines/flux2/before_denoise.py @@ -129,17 +129,9 @@ class Flux2SetTimestepsStep(ModularPipelineBlocks): InputParam("num_inference_steps", default=50), InputParam("timesteps"), InputParam("sigmas"), - InputParam("guidance_scale", default=4.0), InputParam("latents", type_hint=torch.Tensor), - InputParam("num_images_per_prompt", default=1), InputParam("height", type_hint=int), InputParam("width", type_hint=int), - InputParam( - "batch_size", - required=True, - type_hint=int, - description="Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`.", - ), ] @property @@ -151,13 +143,12 @@ class Flux2SetTimestepsStep(ModularPipelineBlocks): type_hint=int, description="The number of denoising steps to perform at inference time", ), - OutputParam("guidance", type_hint=torch.Tensor, description="Guidance scale tensor"), ] @torch.no_grad() def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState: block_state = self.get_block_state(state) - block_state.device = components._execution_device + device = components._execution_device scheduler = components.scheduler @@ -183,7 +174,7 @@ class Flux2SetTimestepsStep(ModularPipelineBlocks): timesteps, num_inference_steps = retrieve_timesteps( scheduler, num_inference_steps, - block_state.device, + device, timesteps=timesteps, sigmas=sigmas, mu=mu, @@ -191,11 +182,6 @@ class Flux2SetTimestepsStep(ModularPipelineBlocks): block_state.timesteps = timesteps block_state.num_inference_steps = num_inference_steps - batch_size = block_state.batch_size * block_state.num_images_per_prompt - guidance = torch.full([1], block_state.guidance_scale, device=block_state.device, dtype=torch.float32) - guidance = guidance.expand(batch_size) - block_state.guidance = guidance - components.scheduler.set_begin_index(0) self.set_block_state(state, block_state) @@ -353,7 +339,6 @@ class Flux2RoPEInputsStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam(name="prompt_embeds", required=True), - InputParam(name="latent_ids"), ] @property @@ -365,12 +350,6 @@ class Flux2RoPEInputsStep(ModularPipelineBlocks): type_hint=torch.Tensor, description="4D position IDs (T, H, W, L) for text tokens, used for RoPE calculation.", ), - OutputParam( - name="latent_ids", - kwargs_type="denoiser_input_fields", - type_hint=torch.Tensor, - description="4D position IDs (T, H, W, L) for image latents, used for RoPE calculation.", - ), ] @staticmethod @@ -403,6 +382,72 @@ class Flux2RoPEInputsStep(ModularPipelineBlocks): return components, state +class 
Flux2KleinBaseRoPEInputsStep(ModularPipelineBlocks): + model_name = "flux2-klein" + + @property + def description(self) -> str: + return "Step that prepares the 4D RoPE position IDs for Flux2-Klein base model denoising. Should be placed after text encoder and latent preparation steps." + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam(name="prompt_embeds", required=True), + InputParam(name="negative_prompt_embeds", required=False), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + name="txt_ids", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="4D position IDs (T, H, W, L) for text tokens, used for RoPE calculation.", + ), + OutputParam( + name="negative_txt_ids", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="4D position IDs (T, H, W, L) for negative text tokens, used for RoPE calculation.", + ), + ] + + @staticmethod + def _prepare_text_ids(x: torch.Tensor, t_coord: Optional[torch.Tensor] = None): + """Prepare 4D position IDs for text tokens.""" + B, L, _ = x.shape + out_ids = [] + + for i in range(B): + t = torch.arange(1) if t_coord is None else t_coord[i] + h = torch.arange(1) + w = torch.arange(1) + seq_l = torch.arange(L) + + coords = torch.cartesian_prod(t, h, w, seq_l) + out_ids.append(coords) + + return torch.stack(out_ids) + + def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + prompt_embeds = block_state.prompt_embeds + device = prompt_embeds.device + + block_state.txt_ids = self._prepare_text_ids(prompt_embeds) + block_state.txt_ids = block_state.txt_ids.to(device) + + block_state.negative_txt_ids = None + if block_state.negative_prompt_embeds is not None: + block_state.negative_txt_ids = self._prepare_text_ids(block_state.negative_prompt_embeds) + block_state.negative_txt_ids = block_state.negative_txt_ids.to(device) + + self.set_block_state(state, block_state) + return components, state + + class Flux2PrepareImageLatentsStep(ModularPipelineBlocks): model_name = "flux2" @@ -506,3 +551,42 @@ class Flux2PrepareImageLatentsStep(ModularPipelineBlocks): self.set_block_state(state, block_state) return components, state + + +class Flux2PrepareGuidanceStep(ModularPipelineBlocks): + model_name = "flux2" + + @property + def description(self) -> str: + return "Step that prepares the guidance scale tensor for Flux2 inference" + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("guidance_scale", default=4.0), + InputParam("num_images_per_prompt", default=1), + InputParam( + "batch_size", + required=True, + type_hint=int, + description="Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`.", + ), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam("guidance", type_hint=torch.Tensor, description="Guidance scale tensor"), + ] + + @torch.no_grad() + def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + device = components._execution_device + batch_size = block_state.batch_size * block_state.num_images_per_prompt + guidance = torch.full([1], block_state.guidance_scale, device=device, dtype=torch.float32) + guidance = guidance.expand(batch_size) + block_state.guidance = guidance + + self.set_block_state(state, block_state) + return components, state diff --git 
a/src/diffusers/modular_pipelines/flux2/decoders.py b/src/diffusers/modular_pipelines/flux2/decoders.py index b769d91198..c793750720 100644 --- a/src/diffusers/modular_pipelines/flux2/decoders.py +++ b/src/diffusers/modular_pipelines/flux2/decoders.py @@ -29,29 +29,16 @@ from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class Flux2DecodeStep(ModularPipelineBlocks): +class Flux2UnpackLatentsStep(ModularPipelineBlocks): model_name = "flux2" - @property - def expected_components(self) -> List[ComponentSpec]: - return [ - ComponentSpec("vae", AutoencoderKLFlux2), - ComponentSpec( - "image_processor", - Flux2ImageProcessor, - config=FrozenDict({"vae_scale_factor": 16, "vae_latent_channels": 32}), - default_creation_method="from_config", - ), - ] - @property def description(self) -> str: - return "Step that decodes the denoised latents into images using Flux2 VAE with batch norm denormalization" + return "Step that unpacks the latents from the denoising step" @property def inputs(self) -> List[Tuple[str, Any]]: return [ - InputParam("output_type", default="pil"), InputParam( "latents", required=True, @@ -70,9 +57,9 @@ class Flux2DecodeStep(ModularPipelineBlocks): def intermediate_outputs(self) -> List[str]: return [ OutputParam( - "images", - type_hint=Union[List[PIL.Image.Image], torch.Tensor, np.ndarray], - description="The generated images, can be a list of PIL.Image.Image, torch.Tensor or a numpy array", + "latents", + type_hint=torch.Tensor, + description="The denoise latents from denoising step, unpacked with position IDs.", ) ] @@ -107,6 +94,62 @@ class Flux2DecodeStep(ModularPipelineBlocks): return torch.stack(x_list, dim=0) + @torch.no_grad() + def __call__(self, components, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + latents = block_state.latents + latent_ids = block_state.latent_ids + + latents = self._unpack_latents_with_ids(latents, latent_ids) + + block_state.latents = latents + + self.set_block_state(state, block_state) + return components, state + + +class Flux2DecodeStep(ModularPipelineBlocks): + model_name = "flux2" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("vae", AutoencoderKLFlux2), + ComponentSpec( + "image_processor", + Flux2ImageProcessor, + config=FrozenDict({"vae_scale_factor": 16, "vae_latent_channels": 32}), + default_creation_method="from_config", + ), + ] + + @property + def description(self) -> str: + return "Step that decodes the denoised latents into images using Flux2 VAE with batch norm denormalization" + + @property + def inputs(self) -> List[Tuple[str, Any]]: + return [ + InputParam("output_type", default="pil"), + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents from the denoising step", + ), + ] + + @property + def intermediate_outputs(self) -> List[str]: + return [ + OutputParam( + "images", + type_hint=Union[List[PIL.Image.Image], torch.Tensor, np.ndarray], + description="The generated images, can be a list of PIL.Image.Image, torch.Tensor or a numpy array", + ) + ] + @staticmethod def _unpatchify_latents(latents): """Convert patchified latents back to regular format.""" @@ -121,26 +164,20 @@ class Flux2DecodeStep(ModularPipelineBlocks): block_state = self.get_block_state(state) vae = components.vae - if block_state.output_type == "latent": - block_state.images = block_state.latents - else: - latents = 
block_state.latents - latent_ids = block_state.latent_ids + latents = block_state.latents - latents = self._unpack_latents_with_ids(latents, latent_ids) + latents_bn_mean = vae.bn.running_mean.view(1, -1, 1, 1).to(latents.device, latents.dtype) + latents_bn_std = torch.sqrt(vae.bn.running_var.view(1, -1, 1, 1) + vae.config.batch_norm_eps).to( + latents.device, latents.dtype + ) + latents = latents * latents_bn_std + latents_bn_mean - latents_bn_mean = vae.bn.running_mean.view(1, -1, 1, 1).to(latents.device, latents.dtype) - latents_bn_std = torch.sqrt(vae.bn.running_var.view(1, -1, 1, 1) + vae.config.batch_norm_eps).to( - latents.device, latents.dtype - ) - latents = latents * latents_bn_std + latents_bn_mean + latents = self._unpatchify_latents(latents) - latents = self._unpatchify_latents(latents) - - block_state.images = vae.decode(latents, return_dict=False)[0] - block_state.images = components.image_processor.postprocess( - block_state.images, output_type=block_state.output_type - ) + block_state.images = vae.decode(latents, return_dict=False)[0] + block_state.images = components.image_processor.postprocess( + block_state.images, output_type=block_state.output_type + ) self.set_block_state(state, block_state) return components, state diff --git a/src/diffusers/modular_pipelines/flux2/denoise.py b/src/diffusers/modular_pipelines/flux2/denoise.py index c12eca65c6..a726959a29 100644 --- a/src/diffusers/modular_pipelines/flux2/denoise.py +++ b/src/diffusers/modular_pipelines/flux2/denoise.py @@ -16,6 +16,8 @@ from typing import Any, List, Tuple import torch +from ...configuration_utils import FrozenDict +from ...guiders import ClassifierFreeGuidance from ...models import Flux2Transformer2DModel from ...schedulers import FlowMatchEulerDiscreteScheduler from ...utils import is_torch_xla_available, logging @@ -25,8 +27,8 @@ from ..modular_pipeline import ( ModularPipelineBlocks, PipelineState, ) -from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam -from .modular_pipeline import Flux2ModularPipeline +from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam +from .modular_pipeline import Flux2KleinModularPipeline, Flux2ModularPipeline if is_torch_xla_available(): @@ -134,6 +136,229 @@ class Flux2LoopDenoiser(ModularPipelineBlocks): return components, block_state +# same as Flux2LoopDenoiser but guidance=None +class Flux2KleinLoopDenoiser(ModularPipelineBlocks): + model_name = "flux2-klein" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ComponentSpec("transformer", Flux2Transformer2DModel)] + + @property + def description(self) -> str: + return ( + "Step within the denoising loop that denoises the latents for Flux2. " + "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` " + "object (e.g. `Flux2DenoiseLoopWrapper`)" + ) + + @property + def inputs(self) -> List[Tuple[str, Any]]: + return [ + InputParam("joint_attention_kwargs"), + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The latents to denoise. Shape: (B, seq_len, C)", + ), + InputParam( + "image_latents", + type_hint=torch.Tensor, + description="Packed image latents for conditioning. Shape: (B, img_seq_len, C)", + ), + InputParam( + "image_latent_ids", + type_hint=torch.Tensor, + description="Position IDs for image latents. 
Shape: (B, img_seq_len, 4)", + ), + InputParam( + "prompt_embeds", + required=True, + type_hint=torch.Tensor, + description="Text embeddings from Qwen3", + ), + InputParam( + "txt_ids", + required=True, + type_hint=torch.Tensor, + description="4D position IDs for text tokens (T, H, W, L)", + ), + InputParam( + "latent_ids", + required=True, + type_hint=torch.Tensor, + description="4D position IDs for latent tokens (T, H, W, L)", + ), + ] + + @torch.no_grad() + def __call__( + self, components: Flux2KleinModularPipeline, block_state: BlockState, i: int, t: torch.Tensor + ) -> PipelineState: + latents = block_state.latents + latent_model_input = latents.to(components.transformer.dtype) + img_ids = block_state.latent_ids + + image_latents = getattr(block_state, "image_latents", None) + if image_latents is not None: + latent_model_input = torch.cat([latents, image_latents], dim=1).to(components.transformer.dtype) + image_latent_ids = block_state.image_latent_ids + img_ids = torch.cat([img_ids, image_latent_ids], dim=1) + + timestep = t.expand(latents.shape[0]).to(latents.dtype) + + noise_pred = components.transformer( + hidden_states=latent_model_input, + timestep=timestep / 1000, + guidance=None, + encoder_hidden_states=block_state.prompt_embeds, + txt_ids=block_state.txt_ids, + img_ids=img_ids, + joint_attention_kwargs=block_state.joint_attention_kwargs, + return_dict=False, + )[0] + + noise_pred = noise_pred[:, : latents.size(1)] + block_state.noise_pred = noise_pred + + return components, block_state + + +# support CFG for Flux2-Klein base model +class Flux2KleinBaseLoopDenoiser(ModularPipelineBlocks): + model_name = "flux2-klein" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("transformer", Flux2Transformer2DModel), + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 4.0}), + default_creation_method="from_config", + ), + ] + + @property + def expected_configs(self) -> List[ConfigSpec]: + return [ + ConfigSpec(name="is_distilled", default=False), + ] + + @property + def description(self) -> str: + return ( + "Step within the denoising loop that denoises the latents for Flux2. " + "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` " + "object (e.g. `Flux2DenoiseLoopWrapper`)" + ) + + @property + def inputs(self) -> List[Tuple[str, Any]]: + return [ + InputParam("joint_attention_kwargs"), + InputParam( + "latents", + required=True, + type_hint=torch.Tensor, + description="The latents to denoise. Shape: (B, seq_len, C)", + ), + InputParam( + "image_latents", + type_hint=torch.Tensor, + description="Packed image latents for conditioning. Shape: (B, img_seq_len, C)", + ), + InputParam( + "image_latent_ids", + type_hint=torch.Tensor, + description="Position IDs for image latents. 
Shape: (B, img_seq_len, 4)", + ), + InputParam( + "prompt_embeds", + required=True, + type_hint=torch.Tensor, + description="Text embeddings from Qwen3", + ), + InputParam( + "negative_prompt_embeds", + required=False, + type_hint=torch.Tensor, + description="Negative text embeddings from Qwen3", + ), + InputParam( + "txt_ids", + required=True, + type_hint=torch.Tensor, + description="4D position IDs for text tokens (T, H, W, L)", + ), + InputParam( + "negative_txt_ids", + required=False, + type_hint=torch.Tensor, + description="4D position IDs for negative text tokens (T, H, W, L)", + ), + InputParam( + "latent_ids", + required=True, + type_hint=torch.Tensor, + description="4D position IDs for latent tokens (T, H, W, L)", + ), + ] + + @torch.no_grad() + def __call__( + self, components: Flux2KleinModularPipeline, block_state: BlockState, i: int, t: torch.Tensor + ) -> PipelineState: + latents = block_state.latents + latent_model_input = latents.to(components.transformer.dtype) + img_ids = block_state.latent_ids + + image_latents = getattr(block_state, "image_latents", None) + if image_latents is not None: + latent_model_input = torch.cat([latents, image_latents], dim=1).to(components.transformer.dtype) + image_latent_ids = block_state.image_latent_ids + img_ids = torch.cat([img_ids, image_latent_ids], dim=1) + + timestep = t.expand(latents.shape[0]).to(latents.dtype) + + guider_inputs = { + "encoder_hidden_states": ( + getattr(block_state, "prompt_embeds", None), + getattr(block_state, "negative_prompt_embeds", None), + ), + "txt_ids": ( + getattr(block_state, "txt_ids", None), + getattr(block_state, "negative_txt_ids", None), + ), + } + + components.guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t) + guider_state = components.guider.prepare_inputs(guider_inputs) + + for guider_state_batch in guider_state: + components.guider.prepare_models(components.transformer) + cond_kwargs = {input_name: getattr(guider_state_batch, input_name) for input_name in guider_inputs.keys()} + + noise_pred = components.transformer( + hidden_states=latent_model_input, + timestep=timestep / 1000, + guidance=None, + img_ids=img_ids, + joint_attention_kwargs=block_state.joint_attention_kwargs, + return_dict=False, + **cond_kwargs, + )[0] + guider_state_batch.noise_pred = noise_pred[:, : latents.size(1)] + components.guider.cleanup_models(components.transformer) + + # perform guidance + block_state.noise_pred = components.guider(guider_state)[0] + + return components, block_state + + class Flux2LoopAfterDenoiser(ModularPipelineBlocks): model_name = "flux2" @@ -250,3 +475,35 @@ class Flux2DenoiseStep(Flux2DenoiseLoopWrapper): " - `Flux2LoopAfterDenoiser`\n" "This block supports both text-to-image and image-conditioned generation." ) + + +class Flux2KleinDenoiseStep(Flux2DenoiseLoopWrapper): + block_classes = [Flux2KleinLoopDenoiser, Flux2LoopAfterDenoiser] + block_names = ["denoiser", "after_denoiser"] + + @property + def description(self) -> str: + return ( + "Denoise step that iteratively denoises the latents for Flux2. \n" + "Its loop logic is defined in `Flux2DenoiseLoopWrapper.__call__` method \n" + "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n" + " - `Flux2KleinLoopDenoiser`\n" + " - `Flux2LoopAfterDenoiser`\n" + "This block supports both text-to-image and image-conditioned generation." 
+ ) + + +class Flux2KleinBaseDenoiseStep(Flux2DenoiseLoopWrapper): + block_classes = [Flux2KleinBaseLoopDenoiser, Flux2LoopAfterDenoiser] + block_names = ["denoiser", "after_denoiser"] + + @property + def description(self) -> str: + return ( + "Denoise step that iteratively denoises the latents for Flux2. \n" + "Its loop logic is defined in `Flux2DenoiseLoopWrapper.__call__` method \n" + "At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n" + " - `Flux2KleinBaseLoopDenoiser`\n" + " - `Flux2LoopAfterDenoiser`\n" + "This block supports both text-to-image and image-conditioned generation." + ) diff --git a/src/diffusers/modular_pipelines/flux2/encoders.py b/src/diffusers/modular_pipelines/flux2/encoders.py index 6cb0e3bf0a..265fb38736 100644 --- a/src/diffusers/modular_pipelines/flux2/encoders.py +++ b/src/diffusers/modular_pipelines/flux2/encoders.py @@ -15,13 +15,15 @@ from typing import List, Optional, Tuple, Union import torch -from transformers import AutoProcessor, Mistral3ForConditionalGeneration +from transformers import AutoProcessor, Mistral3ForConditionalGeneration, Qwen2TokenizerFast, Qwen3ForCausalLM +from ...configuration_utils import FrozenDict +from ...guiders import ClassifierFreeGuidance from ...models import AutoencoderKLFlux2 from ...utils import logging from ..modular_pipeline import ModularPipelineBlocks, PipelineState -from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam -from .modular_pipeline import Flux2ModularPipeline +from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam +from .modular_pipeline import Flux2KleinModularPipeline, Flux2ModularPipeline logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -79,10 +81,8 @@ class Flux2TextEncoderStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam("prompt"), - InputParam("prompt_embeds", type_hint=torch.Tensor, required=False), InputParam("max_sequence_length", type_hint=int, default=512, required=False), InputParam("text_encoder_out_layers", type_hint=Tuple[int], default=(10, 20, 30), required=False), - InputParam("joint_attention_kwargs"), ] @property @@ -99,14 +99,7 @@ class Flux2TextEncoderStep(ModularPipelineBlocks): @staticmethod def check_inputs(block_state): prompt = block_state.prompt - prompt_embeds = getattr(block_state, "prompt_embeds", None) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. " - "Please make sure to only forward one of the two." 
- ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") @staticmethod @@ -165,10 +158,6 @@ class Flux2TextEncoderStep(ModularPipelineBlocks): block_state.device = components._execution_device - if block_state.prompt_embeds is not None: - self.set_block_state(state, block_state) - return components, state - prompt = block_state.prompt if prompt is None: prompt = "" @@ -205,7 +194,6 @@ class Flux2RemoteTextEncoderStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam("prompt"), - InputParam("prompt_embeds", type_hint=torch.Tensor, required=False), ] @property @@ -222,15 +210,8 @@ class Flux2RemoteTextEncoderStep(ModularPipelineBlocks): @staticmethod def check_inputs(block_state): prompt = block_state.prompt - prompt_embeds = getattr(block_state, "prompt_embeds", None) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. " - "Please make sure to only forward one of the two." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(block_state.prompt)}") @torch.no_grad() def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState: @@ -244,10 +225,6 @@ class Flux2RemoteTextEncoderStep(ModularPipelineBlocks): block_state.device = components._execution_device - if block_state.prompt_embeds is not None: - self.set_block_state(state, block_state) - return components, state - prompt = block_state.prompt if prompt is None: prompt = "" @@ -270,6 +247,289 @@ class Flux2RemoteTextEncoderStep(ModularPipelineBlocks): return components, state +class Flux2KleinTextEncoderStep(ModularPipelineBlocks): + model_name = "flux2-klein" + + @property + def description(self) -> str: + return "Text Encoder step that generates text embeddings using Qwen3 to guide the image generation" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("text_encoder", Qwen3ForCausalLM), + ComponentSpec("tokenizer", Qwen2TokenizerFast), + ] + + @property + def expected_configs(self) -> List[ConfigSpec]: + return [ + ConfigSpec(name="is_distilled", default=True), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("prompt"), + InputParam("max_sequence_length", type_hint=int, default=512, required=False), + InputParam("text_encoder_out_layers", type_hint=Tuple[int], default=(9, 18, 27), required=False), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + "prompt_embeds", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="Text embeddings from qwen3 used to guide the image generation", + ), + ] + + @staticmethod + def check_inputs(block_state): + prompt = block_state.prompt + + if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + @staticmethod + # Copied from 
diffusers.pipelines.flux2.pipeline_flux2_klein.Flux2KleinPipeline._get_qwen3_prompt_embeds + def _get_qwen3_prompt_embeds( + text_encoder: Qwen3ForCausalLM, + tokenizer: Qwen2TokenizerFast, + prompt: Union[str, List[str]], + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + max_sequence_length: int = 512, + hidden_states_layers: List[int] = (9, 18, 27), + ): + dtype = text_encoder.dtype if dtype is None else dtype + device = text_encoder.device if device is None else device + + prompt = [prompt] if isinstance(prompt, str) else prompt + + all_input_ids = [] + all_attention_masks = [] + + for single_prompt in prompt: + messages = [{"role": "user", "content": single_prompt}] + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=False, + ) + inputs = tokenizer( + text, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=max_sequence_length, + ) + + all_input_ids.append(inputs["input_ids"]) + all_attention_masks.append(inputs["attention_mask"]) + + input_ids = torch.cat(all_input_ids, dim=0).to(device) + attention_mask = torch.cat(all_attention_masks, dim=0).to(device) + + # Forward pass through the model + output = text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + output_hidden_states=True, + use_cache=False, + ) + + # Only use outputs from intermediate layers and stack them + out = torch.stack([output.hidden_states[k] for k in hidden_states_layers], dim=1) + out = out.to(dtype=dtype, device=device) + + batch_size, num_channels, seq_len, hidden_dim = out.shape + prompt_embeds = out.permute(0, 2, 1, 3).reshape(batch_size, seq_len, num_channels * hidden_dim) + + return prompt_embeds + + @torch.no_grad() + def __call__(self, components: Flux2KleinModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + self.check_inputs(block_state) + + device = components._execution_device + + prompt = block_state.prompt + if prompt is None: + prompt = "" + prompt = [prompt] if isinstance(prompt, str) else prompt + + block_state.prompt_embeds = self._get_qwen3_prompt_embeds( + text_encoder=components.text_encoder, + tokenizer=components.tokenizer, + prompt=prompt, + device=device, + max_sequence_length=block_state.max_sequence_length, + hidden_states_layers=block_state.text_encoder_out_layers, + ) + + self.set_block_state(state, block_state) + return components, state + + +class Flux2KleinBaseTextEncoderStep(ModularPipelineBlocks): + model_name = "flux2-klein" + + @property + def description(self) -> str: + return "Text Encoder step that generates text embeddings using Qwen3 to guide the image generation" + + @property + def expected_components(self) -> List[ComponentSpec]: + return [ + ComponentSpec("text_encoder", Qwen3ForCausalLM), + ComponentSpec("tokenizer", Qwen2TokenizerFast), + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 4.0}), + default_creation_method="from_config", + ), + ] + + @property + def expected_configs(self) -> List[ConfigSpec]: + return [ + ConfigSpec(name="is_distilled", default=False), + ] + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("prompt"), + InputParam("max_sequence_length", type_hint=int, default=512, required=False), + InputParam("text_encoder_out_layers", type_hint=Tuple[int], default=(9, 18, 27), required=False), + ] + + @property + def intermediate_outputs(self) -> List[OutputParam]: + return [ + OutputParam( + 
"prompt_embeds", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="Text embeddings from qwen3 used to guide the image generation", + ), + OutputParam( + "negative_prompt_embeds", + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="Negative text embeddings from qwen3 used to guide the image generation", + ), + ] + + @staticmethod + def check_inputs(block_state): + prompt = block_state.prompt + + if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + @staticmethod + # Copied from diffusers.pipelines.flux2.pipeline_flux2_klein.Flux2KleinPipeline._get_qwen3_prompt_embeds + def _get_qwen3_prompt_embeds( + text_encoder: Qwen3ForCausalLM, + tokenizer: Qwen2TokenizerFast, + prompt: Union[str, List[str]], + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + max_sequence_length: int = 512, + hidden_states_layers: List[int] = (9, 18, 27), + ): + dtype = text_encoder.dtype if dtype is None else dtype + device = text_encoder.device if device is None else device + + prompt = [prompt] if isinstance(prompt, str) else prompt + + all_input_ids = [] + all_attention_masks = [] + + for single_prompt in prompt: + messages = [{"role": "user", "content": single_prompt}] + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=False, + ) + inputs = tokenizer( + text, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=max_sequence_length, + ) + + all_input_ids.append(inputs["input_ids"]) + all_attention_masks.append(inputs["attention_mask"]) + + input_ids = torch.cat(all_input_ids, dim=0).to(device) + attention_mask = torch.cat(all_attention_masks, dim=0).to(device) + + # Forward pass through the model + output = text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + output_hidden_states=True, + use_cache=False, + ) + + # Only use outputs from intermediate layers and stack them + out = torch.stack([output.hidden_states[k] for k in hidden_states_layers], dim=1) + out = out.to(dtype=dtype, device=device) + + batch_size, num_channels, seq_len, hidden_dim = out.shape + prompt_embeds = out.permute(0, 2, 1, 3).reshape(batch_size, seq_len, num_channels * hidden_dim) + + return prompt_embeds + + @torch.no_grad() + def __call__(self, components: Flux2KleinModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + self.check_inputs(block_state) + + device = components._execution_device + + prompt = block_state.prompt + if prompt is None: + prompt = "" + prompt = [prompt] if isinstance(prompt, str) else prompt + + block_state.prompt_embeds = self._get_qwen3_prompt_embeds( + text_encoder=components.text_encoder, + tokenizer=components.tokenizer, + prompt=prompt, + device=device, + max_sequence_length=block_state.max_sequence_length, + hidden_states_layers=block_state.text_encoder_out_layers, + ) + + if components.requires_unconditional_embeds: + negative_prompt = [""] * len(prompt) + block_state.negative_prompt_embeds = self._get_qwen3_prompt_embeds( + text_encoder=components.text_encoder, + tokenizer=components.tokenizer, + prompt=negative_prompt, + device=device, + max_sequence_length=block_state.max_sequence_length, + hidden_states_layers=block_state.text_encoder_out_layers, + ) + else: + block_state.negative_prompt_embeds = None + + self.set_block_state(state, 
block_state) + return components, state + + class Flux2VaeEncoderStep(ModularPipelineBlocks): model_name = "flux2" diff --git a/src/diffusers/modular_pipelines/flux2/inputs.py b/src/diffusers/modular_pipelines/flux2/inputs.py index c9e337fb0b..3463de1999 100644 --- a/src/diffusers/modular_pipelines/flux2/inputs.py +++ b/src/diffusers/modular_pipelines/flux2/inputs.py @@ -47,7 +47,7 @@ class Flux2TextInputStep(ModularPipelineBlocks): required=True, kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, - description="Pre-generated text embeddings from Mistral3. Can be generated from text_encoder step.", + description="Pre-generated text embeddings. Can be generated from text_encoder step.", ), ] @@ -89,6 +89,90 @@ class Flux2TextInputStep(ModularPipelineBlocks): return components, state +class Flux2KleinBaseTextInputStep(ModularPipelineBlocks): + model_name = "flux2-klein" + + @property + def description(self) -> str: + return ( + "This step:\n" + " 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n" + " 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)" + ) + + @property + def inputs(self) -> List[InputParam]: + return [ + InputParam("num_images_per_prompt", default=1), + InputParam( + "prompt_embeds", + required=True, + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="Pre-generated text embeddings. Can be generated from text_encoder step.", + ), + InputParam( + "negative_prompt_embeds", + required=False, + kwargs_type="denoiser_input_fields", + type_hint=torch.Tensor, + description="Pre-generated negative text embeddings. Can be generated from text_encoder step.", + ), + ] + + @property + def intermediate_outputs(self) -> List[str]: + return [ + OutputParam( + "batch_size", + type_hint=int, + description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt", + ), + OutputParam( + "dtype", + type_hint=torch.dtype, + description="Data type of model tensor inputs (determined by `prompt_embeds`)", + ), + OutputParam( + "prompt_embeds", + type_hint=torch.Tensor, + kwargs_type="denoiser_input_fields", + description="Text embeddings used to guide the image generation", + ), + OutputParam( + "negative_prompt_embeds", + type_hint=torch.Tensor, + kwargs_type="denoiser_input_fields", + description="Negative text embeddings used to guide the image generation", + ), + ] + + @torch.no_grad() + def __call__(self, components: Flux2ModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + block_state.batch_size = block_state.prompt_embeds.shape[0] + block_state.dtype = block_state.prompt_embeds.dtype + + _, seq_len, _ = block_state.prompt_embeds.shape + block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, block_state.num_images_per_prompt, 1) + block_state.prompt_embeds = block_state.prompt_embeds.view( + block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1 + ) + + if block_state.negative_prompt_embeds is not None: + _, seq_len, _ = block_state.negative_prompt_embeds.shape + block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.repeat( + 1, block_state.num_images_per_prompt, 1 + ) + block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.view( + block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1 + ) + + self.set_block_state(state, block_state) + return components, state + + class Flux2ProcessImagesInputStep(ModularPipelineBlocks): 
model_name = "flux2" diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py similarity index 64% rename from src/diffusers/modular_pipelines/flux2/modular_blocks.py rename to src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py index a31673b6e7..41a0ff7dee 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_blocks.py +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2.py @@ -12,16 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import List + +import PIL.Image +import torch + from ...utils import logging from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict +from ..modular_pipeline_utils import InsertableDict, OutputParam from .before_denoise import ( + Flux2PrepareGuidanceStep, Flux2PrepareImageLatentsStep, Flux2PrepareLatentsStep, Flux2RoPEInputsStep, Flux2SetTimestepsStep, ) -from .decoders import Flux2DecodeStep +from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep from .denoise import Flux2DenoiseStep from .encoders import ( Flux2RemoteTextEncoderStep, @@ -41,7 +47,6 @@ Flux2VaeEncoderBlocks = InsertableDict( [ ("preprocess", Flux2ProcessImagesInputStep()), ("encode", Flux2VaeEncoderStep()), - ("prepare_image_latents", Flux2PrepareImageLatentsStep()), ] ) @@ -72,33 +77,56 @@ class Flux2AutoVaeEncoderStep(AutoPipelineBlocks): ) -Flux2BeforeDenoiseBlocks = InsertableDict( +Flux2CoreDenoiseBlocks = InsertableDict( [ + ("input", Flux2TextInputStep()), + ("prepare_image_latents", Flux2PrepareImageLatentsStep()), ("prepare_latents", Flux2PrepareLatentsStep()), ("set_timesteps", Flux2SetTimestepsStep()), + ("prepare_guidance", Flux2PrepareGuidanceStep()), ("prepare_rope_inputs", Flux2RoPEInputsStep()), + ("denoise", Flux2DenoiseStep()), + ("after_denoise", Flux2UnpackLatentsStep()), ] ) -class Flux2BeforeDenoiseStep(SequentialPipelineBlocks): +class Flux2CoreDenoiseStep(SequentialPipelineBlocks): model_name = "flux2" - block_classes = Flux2BeforeDenoiseBlocks.values() - block_names = Flux2BeforeDenoiseBlocks.keys() + block_classes = Flux2CoreDenoiseBlocks.values() + block_names = Flux2CoreDenoiseBlocks.keys() @property def description(self): - return "Before denoise step that prepares the inputs for the denoise step in Flux2 generation." 
+ return ( + "Core denoise step that performs the denoising process for Flux2-dev.\n" + " - `Flux2TextInputStep` (input) standardizes the text inputs (prompt_embeds) for the denoising step.\n" + " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n" + " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n" + " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n" + " - `Flux2PrepareGuidanceStep` (prepare_guidance) prepares the guidance tensor for the denoising step.\n" + " - `Flux2RoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids) for the denoising step.\n" + " - `Flux2DenoiseStep` (denoise) iteratively denoises the latents.\n" + " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n" + ) + + @property + def outputs(self): + return [ + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The latents from the denoising step.", + ) + ] AUTO_BLOCKS = InsertableDict( [ ("text_encoder", Flux2TextEncoderStep()), - ("text_input", Flux2TextInputStep()), - ("vae_image_encoder", Flux2AutoVaeEncoderStep()), - ("before_denoise", Flux2BeforeDenoiseStep()), - ("denoise", Flux2DenoiseStep()), + ("vae_encoder", Flux2AutoVaeEncoderStep()), + ("denoise", Flux2CoreDenoiseStep()), ("decode", Flux2DecodeStep()), ] ) @@ -107,10 +135,8 @@ AUTO_BLOCKS = InsertableDict( REMOTE_AUTO_BLOCKS = InsertableDict( [ ("text_encoder", Flux2RemoteTextEncoderStep()), - ("text_input", Flux2TextInputStep()), - ("vae_image_encoder", Flux2AutoVaeEncoderStep()), - ("before_denoise", Flux2BeforeDenoiseStep()), - ("denoise", Flux2DenoiseStep()), + ("vae_encoder", Flux2AutoVaeEncoderStep()), + ("denoise", Flux2CoreDenoiseStep()), ("decode", Flux2DecodeStep()), ] ) @@ -130,6 +156,16 @@ class Flux2AutoBlocks(SequentialPipelineBlocks): "- For image-conditioned generation, you need to provide `image` (list of PIL images)." ) + @property + def outputs(self): + return [ + OutputParam( + name="images", + type_hint=List[PIL.Image.Image], + description="The images from the decoding step.", + ) + ] + TEXT2IMAGE_BLOCKS = InsertableDict( [ @@ -137,8 +173,10 @@ TEXT2IMAGE_BLOCKS = InsertableDict( ("text_input", Flux2TextInputStep()), ("prepare_latents", Flux2PrepareLatentsStep()), ("set_timesteps", Flux2SetTimestepsStep()), + ("prepare_guidance", Flux2PrepareGuidanceStep()), ("prepare_rope_inputs", Flux2RoPEInputsStep()), ("denoise", Flux2DenoiseStep()), + ("after_denoise", Flux2UnpackLatentsStep()), ("decode", Flux2DecodeStep()), ] ) @@ -152,8 +190,10 @@ IMAGE_CONDITIONED_BLOCKS = InsertableDict( ("prepare_image_latents", Flux2PrepareImageLatentsStep()), ("prepare_latents", Flux2PrepareLatentsStep()), ("set_timesteps", Flux2SetTimestepsStep()), + ("prepare_guidance", Flux2PrepareGuidanceStep()), ("prepare_rope_inputs", Flux2RoPEInputsStep()), ("denoise", Flux2DenoiseStep()), + ("after_denoise", Flux2UnpackLatentsStep()), ("decode", Flux2DecodeStep()), ] ) diff --git a/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py new file mode 100644 index 0000000000..984832d77b --- /dev/null +++ b/src/diffusers/modular_pipelines/flux2/modular_blocks_flux2_klein.py @@ -0,0 +1,232 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +import PIL.Image +import torch + +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InsertableDict, OutputParam +from .before_denoise import ( + Flux2KleinBaseRoPEInputsStep, + Flux2PrepareImageLatentsStep, + Flux2PrepareLatentsStep, + Flux2RoPEInputsStep, + Flux2SetTimestepsStep, +) +from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep +from .denoise import Flux2KleinBaseDenoiseStep, Flux2KleinDenoiseStep +from .encoders import ( + Flux2KleinBaseTextEncoderStep, + Flux2KleinTextEncoderStep, + Flux2VaeEncoderStep, +) +from .inputs import ( + Flux2KleinBaseTextInputStep, + Flux2ProcessImagesInputStep, + Flux2TextInputStep, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +################ +# VAE encoder +################ + +Flux2KleinVaeEncoderBlocks = InsertableDict( + [ + ("preprocess", Flux2ProcessImagesInputStep()), + ("encode", Flux2VaeEncoderStep()), + ] +) + + +class Flux2KleinVaeEncoderSequentialStep(SequentialPipelineBlocks): + model_name = "flux2" + + block_classes = Flux2KleinVaeEncoderBlocks.values() + block_names = Flux2KleinVaeEncoderBlocks.keys() + + @property + def description(self) -> str: + return "VAE encoder step that preprocesses and encodes the image inputs into their latent representations." + + +class Flux2KleinAutoVaeEncoderStep(AutoPipelineBlocks): + block_classes = [Flux2KleinVaeEncoderSequentialStep] + block_names = ["img_conditioning"] + block_trigger_inputs = ["image"] + + @property + def description(self): + return ( + "VAE encoder step that encodes the image inputs into their latent representations.\n" + "This is an auto pipeline block that works for image conditioning tasks.\n" + " - `Flux2KleinVaeEncoderSequentialStep` is used when `image` is provided.\n" + " - If `image` is not provided, step will be skipped." 
+        )
+
+
+###
+### Core denoise
+###
+
+Flux2KleinCoreDenoiseBlocks = InsertableDict(
+    [
+        ("input", Flux2TextInputStep()),
+        ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
+        ("prepare_latents", Flux2PrepareLatentsStep()),
+        ("set_timesteps", Flux2SetTimestepsStep()),
+        ("prepare_rope_inputs", Flux2RoPEInputsStep()),
+        ("denoise", Flux2KleinDenoiseStep()),
+        ("after_denoise", Flux2UnpackLatentsStep()),
+    ]
+)
+
+
+class Flux2KleinCoreDenoiseStep(SequentialPipelineBlocks):
+    model_name = "flux2-klein"
+
+    block_classes = Flux2KleinCoreDenoiseBlocks.values()
+    block_names = Flux2KleinCoreDenoiseBlocks.keys()
+
+    @property
+    def description(self):
+        return (
+            "Core denoise step that performs the denoising process for Flux2-Klein (distilled model).\n"
+            " - `Flux2TextInputStep` (input) standardizes the text inputs (prompt_embeds) for the denoising step.\n"
+            " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n"
+            " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n"
+            " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n"
+            " - `Flux2RoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids) for the denoising step.\n"
+            " - `Flux2KleinDenoiseStep` (denoise) iteratively denoises the latents.\n"
+            " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n"
+        )
+
+    @property
+    def outputs(self):
+        return [
+            OutputParam(
+                name="latents",
+                type_hint=torch.Tensor,
+                description="The latents from the denoising step.",
+            )
+        ]
+
+
+Flux2KleinBaseCoreDenoiseBlocks = InsertableDict(
+    [
+        ("input", Flux2KleinBaseTextInputStep()),
+        ("prepare_latents", Flux2PrepareLatentsStep()),
+        ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
+        ("set_timesteps", Flux2SetTimestepsStep()),
+        ("prepare_rope_inputs", Flux2KleinBaseRoPEInputsStep()),
+        ("denoise", Flux2KleinBaseDenoiseStep()),
+        ("after_denoise", Flux2UnpackLatentsStep()),
+    ]
+)
+
+
+class Flux2KleinBaseCoreDenoiseStep(SequentialPipelineBlocks):
+    model_name = "flux2-klein"
+    block_classes = Flux2KleinBaseCoreDenoiseBlocks.values()
+    block_names = Flux2KleinBaseCoreDenoiseBlocks.keys()
+
+    @property
+    def description(self):
+ return ( + "Core denoise step that performs the denoising process for Flux2-Klein (base model).\n" + " - `Flux2KleinBaseTextInputStep` (input) standardizes the text inputs (prompt_embeds + negative_prompt_embeds) for the denoising step.\n" + " - `Flux2PrepareImageLatentsStep` (prepare_image_latents) prepares the image latents and image_latent_ids for the denoising step.\n" + " - `Flux2PrepareLatentsStep` (prepare_latents) prepares the initial latents (latents) and latent_ids for the denoising step.\n" + " - `Flux2SetTimestepsStep` (set_timesteps) sets the timesteps for the denoising step.\n" + " - `Flux2KleinBaseRoPEInputsStep` (prepare_rope_inputs) prepares the RoPE inputs (txt_ids + negative_txt_ids) for the denoising step.\n" + " - `Flux2KleinBaseDenoiseStep` (denoise) iteratively denoises the latents using Classifier-Free Guidance.\n" + " - `Flux2UnpackLatentsStep` (after_denoise) unpacks the latents from the denoising step.\n" + ) + + @property + def outputs(self): + return [ + OutputParam( + name="latents", + type_hint=torch.Tensor, + description="The latents from the denoising step.", + ) + ] + + +### +### Auto blocks +### +class Flux2KleinAutoBlocks(SequentialPipelineBlocks): + model_name = "flux2-klein" + block_classes = [ + Flux2KleinTextEncoderStep(), + Flux2KleinAutoVaeEncoderStep(), + Flux2KleinCoreDenoiseStep(), + Flux2DecodeStep(), + ] + block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] + + @property + def description(self): + return ( + "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein.\n" + + " - for image-conditioned generation, you need to provide `image` (list of PIL images).\n" + + " - for text-to-image generation, all you need to provide is `prompt`.\n" + ) + + @property + def outputs(self): + return [ + OutputParam( + name="images", + type_hint=List[PIL.Image.Image], + description="The images from the decoding step.", + ) + ] + + +class Flux2KleinBaseAutoBlocks(SequentialPipelineBlocks): + model_name = "flux2-klein" + block_classes = [ + Flux2KleinBaseTextEncoderStep(), + Flux2KleinAutoVaeEncoderStep(), + Flux2KleinBaseCoreDenoiseStep(), + Flux2DecodeStep(), + ] + block_names = ["text_encoder", "vae_encoder", "denoise", "decode"] + + @property + def description(self): + return ( + "Auto blocks that perform the text-to-image and image-conditioned generation using Flux2-Klein (base model).\n" + + " - for image-conditioned generation, you need to provide `image` (list of PIL images).\n" + + " - for text-to-image generation, all you need to provide is `prompt`.\n" + ) + + @property + def outputs(self): + return [ + OutputParam( + name="images", + type_hint=List[PIL.Image.Image], + description="The images from the decoding step.", + ) + ] diff --git a/src/diffusers/modular_pipelines/flux2/modular_pipeline.py b/src/diffusers/modular_pipelines/flux2/modular_pipeline.py index 3e497f3b1e..29fbeba07c 100644 --- a/src/diffusers/modular_pipelines/flux2/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/flux2/modular_pipeline.py @@ -13,6 +13,8 @@ # limitations under the License. 
+from typing import Any, Dict, Optional + from ...loaders import Flux2LoraLoaderMixin from ...utils import logging from ..modular_pipeline import ModularPipeline @@ -55,3 +57,56 @@ class Flux2ModularPipeline(ModularPipeline, Flux2LoraLoaderMixin): if getattr(self, "transformer", None): num_channels_latents = self.transformer.config.in_channels // 4 return num_channels_latents + + +class Flux2KleinModularPipeline(ModularPipeline, Flux2LoraLoaderMixin): + """ + A ModularPipeline for Flux2-Klein. + + > [!WARNING] > This is an experimental feature and is likely to change in the future. + """ + + default_blocks_name = "Flux2KleinBaseAutoBlocks" + + def get_default_blocks_name(self, config_dict: Optional[Dict[str, Any]]) -> Optional[str]: + if config_dict is not None and "is_distilled" in config_dict and config_dict["is_distilled"]: + return "Flux2KleinAutoBlocks" + else: + return "Flux2KleinBaseAutoBlocks" + + @property + def default_height(self): + return self.default_sample_size * self.vae_scale_factor + + @property + def default_width(self): + return self.default_sample_size * self.vae_scale_factor + + @property + def default_sample_size(self): + return 128 + + @property + def vae_scale_factor(self): + vae_scale_factor = 8 + if getattr(self, "vae", None) is not None: + vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + return vae_scale_factor + + @property + def num_channels_latents(self): + num_channels_latents = 32 + if getattr(self, "transformer", None): + num_channels_latents = self.transformer.config.in_channels // 4 + return num_channels_latents + + @property + def requires_unconditional_embeds(self): + if hasattr(self.config, "is_distilled") and self.config.is_distilled: + return False + + requires_unconditional_embeds = False + if hasattr(self, "guider") and self.guider is not None: + requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1 + + return requires_unconditional_embeds diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index d857fd0409..98ede73c21 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -59,6 +59,7 @@ MODULAR_PIPELINE_MAPPING = OrderedDict( ("flux", "FluxModularPipeline"), ("flux-kontext", "FluxKontextModularPipeline"), ("flux2", "Flux2ModularPipeline"), + ("flux2-klein", "Flux2KleinModularPipeline"), ("qwenimage", "QwenImageModularPipeline"), ("qwenimage-edit", "QwenImageEditModularPipeline"), ("qwenimage-edit-plus", "QwenImageEditPlusModularPipeline"), diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 63f381419f..a23f852616 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -17,6 +17,51 @@ class Flux2AutoBlocks(metaclass=DummyObject): requires_backends(cls, ["torch", "transformers"]) +class Flux2KleinAutoBlocks(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class Flux2KleinBaseAutoBlocks(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + 
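The `get_default_blocks_name` hook on `Flux2KleinModularPipeline` above is what routes distilled checkpoints to the CFG-free blocks. A small illustrative sketch of that routing, with the dictionaries standing in for the repo config:

def resolve_default_blocks(config_dict):
    # Mirrors Flux2KleinModularPipeline.get_default_blocks_name: distilled checkpoints
    # use Flux2KleinAutoBlocks (no CFG); everything else falls back to the base blocks.
    if config_dict is not None and config_dict.get("is_distilled"):
        return "Flux2KleinAutoBlocks"
    return "Flux2KleinBaseAutoBlocks"


assert resolve_default_blocks({"is_distilled": True}) == "Flux2KleinAutoBlocks"
assert resolve_default_blocks({"is_distilled": False}) == "Flux2KleinBaseAutoBlocks"
assert resolve_default_blocks(None) == "Flux2KleinBaseAutoBlocks"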
def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class Flux2KleinModularPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class Flux2ModularPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py new file mode 100644 index 0000000000..26653b20f8 --- /dev/null +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein.py @@ -0,0 +1,91 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +import numpy as np +import PIL +import pytest + +from diffusers.modular_pipelines import ( + Flux2KleinAutoBlocks, + Flux2KleinModularPipeline, +) + +from ...testing_utils import floats_tensor, torch_device +from ..test_modular_pipelines_common import ModularPipelineTesterMixin + + +class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = Flux2KleinModularPipeline + pipeline_blocks_class = Flux2KleinAutoBlocks + pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-modular" + + params = frozenset(["prompt", "height", "width"]) + batch_params = frozenset(["prompt"]) + + def get_dummy_inputs(self, seed=0): + generator = self.get_generator(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + # TODO (Dhruv): Update text encoder config so that vocab_size matches tokenizer + "max_sequence_length": 8, # bit of a hack to workaround vocab size mismatch + "text_encoder_out_layers": (1,), + "generator": generator, + "num_inference_steps": 2, + "height": 32, + "width": 32, + "output_type": "pt", + } + return inputs + + def test_float16_inference(self): + super().test_float16_inference(9e-2) + + +class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = Flux2KleinModularPipeline + pipeline_blocks_class = Flux2KleinAutoBlocks + pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-modular" + + params = frozenset(["prompt", "height", "width", "image"]) + batch_params = frozenset(["prompt", "image"]) + + def get_dummy_inputs(self, seed=0): + generator = self.get_generator(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + # TODO (Dhruv): Update text encoder config so that vocab_size matches tokenizer + "max_sequence_length": 8, # bit of a 
hack to workaround vocab size mismatch + "text_encoder_out_layers": (1,), + "generator": generator, + "num_inference_steps": 2, + "height": 32, + "width": 32, + "output_type": "pt", + } + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(torch_device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = PIL.Image.fromarray(np.uint8(image * 255)).convert("RGB") + inputs["image"] = init_image + + return inputs + + def test_float16_inference(self): + super().test_float16_inference(9e-2) + + @pytest.mark.skip(reason="batched inference is currently not supported") + def test_inference_batch_single_identical(self, batch_size=2, expected_max_diff=0.0001): + return diff --git a/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py new file mode 100644 index 0000000000..701dd0fed8 --- /dev/null +++ b/tests/modular_pipelines/flux2/test_modular_pipeline_flux2_klein_base.py @@ -0,0 +1,91 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random + +import numpy as np +import PIL +import pytest + +from diffusers.modular_pipelines import ( + Flux2KleinBaseAutoBlocks, + Flux2KleinModularPipeline, +) + +from ...testing_utils import floats_tensor, torch_device +from ..test_modular_pipelines_common import ModularPipelineTesterMixin + + +class TestFlux2ModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = Flux2KleinModularPipeline + pipeline_blocks_class = Flux2KleinBaseAutoBlocks + pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-base-modular" + + params = frozenset(["prompt", "height", "width"]) + batch_params = frozenset(["prompt"]) + + def get_dummy_inputs(self, seed=0): + generator = self.get_generator(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + # TODO (Dhruv): Update text encoder config so that vocab_size matches tokenizer + "max_sequence_length": 8, # bit of a hack to workaround vocab size mismatch + "text_encoder_out_layers": (1,), + "generator": generator, + "num_inference_steps": 2, + "height": 32, + "width": 32, + "output_type": "pt", + } + return inputs + + def test_float16_inference(self): + super().test_float16_inference(9e-2) + + +class TestFlux2ImageConditionedModularPipelineFast(ModularPipelineTesterMixin): + pipeline_class = Flux2KleinModularPipeline + pipeline_blocks_class = Flux2KleinBaseAutoBlocks + pretrained_model_name_or_path = "hf-internal-testing/tiny-flux2-klein-base-modular" + + params = frozenset(["prompt", "height", "width", "image"]) + batch_params = frozenset(["prompt", "image"]) + + def get_dummy_inputs(self, seed=0): + generator = self.get_generator(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + # TODO (Dhruv): Update text encoder config so that vocab_size matches tokenizer + "max_sequence_length": 8, # bit of a hack to workaround vocab size mismatch + "text_encoder_out_layers": (1,), + "generator": generator, 
+ "num_inference_steps": 2, + "height": 32, + "width": 32, + "output_type": "pt", + } + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(torch_device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = PIL.Image.fromarray(np.uint8(image * 255)).convert("RGB") + inputs["image"] = init_image + + return inputs + + def test_float16_inference(self): + super().test_float16_inference(9e-2) + + @pytest.mark.skip(reason="batched inference is currently not supported") + def test_inference_batch_single_identical(self, batch_size=2, expected_max_diff=0.0001): + return From ef913010d4a337e1696cf50ff0cbb008e79c6b67 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 28 Jan 2026 15:44:12 +0530 Subject: [PATCH 08/10] [QwenImage] fix prompt isolation tests (#13042) * up * up * up * fix --- .../train_dreambooth_lora_qwen_image.py | 3 ++- .../pipelines/qwenimage/pipeline_qwenimage.py | 22 ++++++----------- .../pipeline_qwenimage_controlnet.py | 19 +++++---------- .../pipeline_qwenimage_edit_inpaint.py | 8 ------- .../qwenimage/pipeline_qwenimage_img2img.py | 24 +++++++------------ .../qwenimage/pipeline_qwenimage_inpaint.py | 23 +++++++----------- .../qwenimage/pipeline_qwenimage_layered.py | 13 +++++----- 7 files changed, 38 insertions(+), 74 deletions(-) diff --git a/examples/dreambooth/train_dreambooth_lora_qwen_image.py b/examples/dreambooth/train_dreambooth_lora_qwen_image.py index ea9b137b0a..33a1054eff 100644 --- a/examples/dreambooth/train_dreambooth_lora_qwen_image.py +++ b/examples/dreambooth/train_dreambooth_lora_qwen_image.py @@ -1467,7 +1467,8 @@ def main(args): else: num_repeat_elements = len(prompts) prompt_embeds = prompt_embeds.repeat(num_repeat_elements, 1, 1) - prompt_embeds_mask = prompt_embeds_mask.repeat(num_repeat_elements, 1) + if prompt_embeds_mask is not None: + prompt_embeds_mask = prompt_embeds_mask.repeat(num_repeat_elements, 1) # Convert images to latent space if args.cache_latents: model_input = latents_cache[step].sample() diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py index 21515df608..9938ae95fd 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage.py @@ -254,16 +254,17 @@ class QwenImagePipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, device) prompt_embeds = prompt_embeds[:, :max_sequence_length] - prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length] - _, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) - prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) - if prompt_embeds_mask is not None and prompt_embeds_mask.all(): - prompt_embeds_mask = None + if prompt_embeds_mask is not None: + prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length] + prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) + prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + + if prompt_embeds_mask.all(): + prompt_embeds_mask = None return prompt_embeds, prompt_embeds_mask @@ -310,15 +311,6 @@ class QwenImagePipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): f" {negative_prompt_embeds}. 
Please make sure to only forward one of the two." ) - if prompt_embeds is not None and prompt_embeds_mask is None: - raise ValueError( - "If `prompt_embeds` are provided, `prompt_embeds_mask` also have to be passed. Make sure to generate `prompt_embeds_mask` from the same text encoder that was used to generate `prompt_embeds`." - ) - if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None: - raise ValueError( - "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also have to be passed. Make sure to generate `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`." - ) - if max_sequence_length is not None and max_sequence_length > 1024: raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}") diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py index 714ec04238..e0cb9924cd 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_controlnet.py @@ -321,11 +321,13 @@ class QwenImageControlNetPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): _, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) - prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) - if prompt_embeds_mask is not None and prompt_embeds_mask.all(): - prompt_embeds_mask = None + if prompt_embeds_mask is not None: + prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) + prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + + if prompt_embeds_mask.all(): + prompt_embeds_mask = None return prompt_embeds, prompt_embeds_mask @@ -372,15 +374,6 @@ class QwenImageControlNetPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): f" {negative_prompt_embeds}. Please make sure to only forward one of the two." ) - if prompt_embeds is not None and prompt_embeds_mask is None: - raise ValueError( - "If `prompt_embeds` are provided, `prompt_embeds_mask` also have to be passed. Make sure to generate `prompt_embeds_mask` from the same text encoder that was used to generate `prompt_embeds`." - ) - if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None: - raise ValueError( - "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also have to be passed. Make sure to generate `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`." - ) - if max_sequence_length is not None and max_sequence_length > 1024: raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}") diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py index 75be62cf6d..bc397f357b 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_edit_inpaint.py @@ -378,14 +378,6 @@ class QwenImageEditInpaintPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
) - if prompt_embeds is not None and prompt_embeds_mask is None: - raise ValueError( - "If `prompt_embeds` are provided, `prompt_embeds_mask` also have to be passed. Make sure to generate `prompt_embeds_mask` from the same text encoder that was used to generate `prompt_embeds`." - ) - if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None: - raise ValueError( - "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also have to be passed. Make sure to generate `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`." - ) if padding_mask_crop is not None: if not isinstance(image, PIL.Image.Image): raise ValueError( diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py index 2c9da7545e..522e1d2030 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_img2img.py @@ -265,7 +265,7 @@ class QwenImageImg2ImgPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): return timesteps, num_inference_steps - t_start - # Copied fromCopied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline.encode_prompt + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline.encode_prompt def encode_prompt( self, prompt: Union[str, List[str]], @@ -297,16 +297,17 @@ class QwenImageImg2ImgPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, device) prompt_embeds = prompt_embeds[:, :max_sequence_length] - prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length] - _, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) - prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) - if prompt_embeds_mask is not None and prompt_embeds_mask.all(): - prompt_embeds_mask = None + if prompt_embeds_mask is not None: + prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length] + prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) + prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + + if prompt_embeds_mask.all(): + prompt_embeds_mask = None return prompt_embeds, prompt_embeds_mask @@ -357,15 +358,6 @@ class QwenImageImg2ImgPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): f" {negative_prompt_embeds}. Please make sure to only forward one of the two." ) - if prompt_embeds is not None and prompt_embeds_mask is None: - raise ValueError( - "If `prompt_embeds` are provided, `prompt_embeds_mask` also have to be passed. Make sure to generate `prompt_embeds_mask` from the same text encoder that was used to generate `prompt_embeds`." - ) - if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None: - raise ValueError( - "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also have to be passed. Make sure to generate `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`." 
- ) - if max_sequence_length is not None and max_sequence_length > 1024: raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}") diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py index 536a7984e8..960196bec1 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_inpaint.py @@ -276,7 +276,7 @@ class QwenImageInpaintPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): return timesteps, num_inference_steps - t_start - # Copied fromCopied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline.encode_prompt + # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline.encode_prompt def encode_prompt( self, prompt: Union[str, List[str]], @@ -308,16 +308,17 @@ class QwenImageInpaintPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, device) prompt_embeds = prompt_embeds[:, :max_sequence_length] - prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length] - _, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) - prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) - if prompt_embeds_mask is not None and prompt_embeds_mask.all(): - prompt_embeds_mask = None + if prompt_embeds_mask is not None: + prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length] + prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) + prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + + if prompt_embeds_mask.all(): + prompt_embeds_mask = None return prompt_embeds, prompt_embeds_mask @@ -372,14 +373,6 @@ class QwenImageInpaintPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): f" {negative_prompt_embeds}. Please make sure to only forward one of the two." ) - if prompt_embeds is not None and prompt_embeds_mask is None: - raise ValueError( - "If `prompt_embeds` are provided, `prompt_embeds_mask` also have to be passed. Make sure to generate `prompt_embeds_mask` from the same text encoder that was used to generate `prompt_embeds`." - ) - if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None: - raise ValueError( - "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also have to be passed. Make sure to generate `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`." 
- ) if padding_mask_crop is not None: if not isinstance(image, PIL.Image.Image): raise ValueError( diff --git a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py index 0a53a0ac77..ea455721e4 100644 --- a/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py +++ b/src/diffusers/pipelines/qwenimage/pipeline_qwenimage_layered.py @@ -320,16 +320,17 @@ the image\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>\n<|im_start|>as prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, device) prompt_embeds = prompt_embeds[:, :max_sequence_length] - prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length] - _, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) - prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) - prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) - if prompt_embeds_mask is not None and prompt_embeds_mask.all(): - prompt_embeds_mask = None + if prompt_embeds_mask is not None: + prompt_embeds_mask = prompt_embeds_mask[:, :max_sequence_length] + prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1) + prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len) + + if prompt_embeds_mask.all(): + prompt_embeds_mask = None return prompt_embeds, prompt_embeds_mask From 2ac39ba664acfbaad554f1d7c7e29a0879efb357 Mon Sep 17 00:00:00 2001 From: Ita Zaporozhets <31893021+itazap@users.noreply.github.com> Date: Wed, 28 Jan 2026 12:43:04 +0100 Subject: [PATCH 09/10] fast tok update (#13036) * v5 tok update * ruff * keep pre v5 slow code path * Apply style fixes --------- Co-authored-by: Sayak Paul Co-authored-by: github-actions[bot] --- src/diffusers/loaders/textual_inversion.py | 50 +++++++++++++++------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/src/diffusers/loaders/textual_inversion.py b/src/diffusers/loaders/textual_inversion.py index 63fc97ed43..e4346700a7 100644 --- a/src/diffusers/loaders/textual_inversion.py +++ b/src/diffusers/loaders/textual_inversion.py @@ -11,11 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
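The `encode_prompt` hunks in patch 08 above all apply the same convention: `prompt_embeds_mask` may now be `None`, and a mask that is all ones is normalized to `None` because it masks nothing. A standalone sketch of that rule (shapes are illustrative):

import torch


def normalize_prompt_mask(prompt_embeds_mask):
    # An all-ones attention mask carries no information, so downstream code can treat it as absent.
    if prompt_embeds_mask is not None and prompt_embeds_mask.all():
        return None
    return prompt_embeds_mask


full = torch.ones(2, 16, dtype=torch.int64)
padded = torch.cat([torch.ones(2, 12), torch.zeros(2, 4)], dim=1).to(torch.int64)
assert normalize_prompt_mask(full) is None
assert normalize_prompt_mask(padded) is not None
assert normalize_prompt_mask(None) is None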
+import json from typing import Dict, List, Optional, Union import safetensors import torch from huggingface_hub.utils import validate_hf_hub_args +from tokenizers import Tokenizer as TokenizerFast from torch import nn from ..models.modeling_utils import load_state_dict @@ -547,23 +549,39 @@ class TextualInversionLoaderMixin: else: last_special_token_id = added_token_id - # Delete from tokenizer - for token_id, token_to_remove in zip(token_ids, tokens): - del tokenizer._added_tokens_decoder[token_id] - del tokenizer._added_tokens_encoder[token_to_remove] - - # Make all token ids sequential in tokenizer - key_id = 1 - for token_id in tokenizer.added_tokens_decoder: - if token_id > last_special_token_id and token_id > last_special_token_id + key_id: - token = tokenizer._added_tokens_decoder[token_id] - tokenizer._added_tokens_decoder[last_special_token_id + key_id] = token + # Fast tokenizers (v5+) + if hasattr(tokenizer, "_tokenizer"): + # Fast tokenizers: serialize, filter tokens, reload + tokenizer_json = json.loads(tokenizer._tokenizer.to_str()) + new_id = last_special_token_id + 1 + filtered = [] + for tok in tokenizer_json.get("added_tokens", []): + if tok.get("content") in set(tokens): + continue + if not tok.get("special", False): + tok["id"] = new_id + new_id += 1 + filtered.append(tok) + tokenizer_json["added_tokens"] = filtered + tokenizer._tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json)) + else: + # Slow tokenizers + for token_id, token_to_remove in zip(token_ids, tokens): del tokenizer._added_tokens_decoder[token_id] - tokenizer._added_tokens_encoder[token.content] = last_special_token_id + key_id - key_id += 1 - tokenizer._update_trie() - # set correct total vocab size after removing tokens - tokenizer._update_total_vocab_size() + del tokenizer._added_tokens_encoder[token_to_remove] + + key_id = 1 + for token_id in list(tokenizer.added_tokens_decoder.keys()): + if token_id > last_special_token_id and token_id > last_special_token_id + key_id: + token = tokenizer._added_tokens_decoder[token_id] + tokenizer._added_tokens_decoder[last_special_token_id + key_id] = token + del tokenizer._added_tokens_decoder[token_id] + tokenizer._added_tokens_encoder[token.content] = last_special_token_id + key_id + key_id += 1 + if hasattr(tokenizer, "_update_trie"): + tokenizer._update_trie() + if hasattr(tokenizer, "_update_total_vocab_size"): + tokenizer._update_total_vocab_size() # Delete from text encoder text_embedding_dim = text_encoder.get_input_embeddings().embedding_dim From 2c669e84808f40ac5b5026fa44e948cfbc325119 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 28 Jan 2026 17:22:27 +0530 Subject: [PATCH 10/10] change to CUDA 12.9. (#13045) * change to CUDA 12.9. 
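The fast-tokenizer branch in patch 09 above relies on round-tripping the serialized tokenizer. A minimal standalone sketch of that idea; it only drops the unwanted added tokens and does not renumber the surviving non-special tokens the way the loader code does:

import json

from tokenizers import Tokenizer


def drop_added_tokens(fast_tokenizer: Tokenizer, tokens_to_remove: set) -> Tokenizer:
    # Serialize to the tokenizer.json structure, filter the `added_tokens` entries, and reload.
    data = json.loads(fast_tokenizer.to_str())
    data["added_tokens"] = [
        tok for tok in data.get("added_tokens", []) if tok.get("content") not in tokens_to_remove
    ]
    return Tokenizer.from_str(json.dumps(data))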
* up * change runtime base * FROM --- docker/diffusers-pytorch-cuda/Dockerfile | 12 ++++++++---- docker/diffusers-pytorch-xformers-cuda/Dockerfile | 12 ++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/docker/diffusers-pytorch-cuda/Dockerfile b/docker/diffusers-pytorch-cuda/Dockerfile index 134b47215d..0c639f8532 100644 --- a/docker/diffusers-pytorch-cuda/Dockerfile +++ b/docker/diffusers-pytorch-cuda/Dockerfile @@ -1,8 +1,8 @@ -FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04 +FROM nvidia/cuda:12.9.0-runtime-ubuntu20.04 LABEL maintainer="Hugging Face" LABEL repository="diffusers" -ARG PYTHON_VERSION=3.11 +ARG PYTHON_VERSION=3.10 ENV DEBIAN_FRONTEND=noninteractive RUN apt-get -y update \ @@ -36,8 +36,12 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH" RUN uv pip install --no-cache-dir \ torch \ torchvision \ - torchaudio \ - --index-url https://download.pytorch.org/whl/cu121 + torchaudio + +# Install compatible versions of numba/llvmlite for Python 3.10+ +RUN uv pip install --no-cache-dir \ + "llvmlite>=0.40.0" \ + "numba>=0.57.0" RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]" diff --git a/docker/diffusers-pytorch-xformers-cuda/Dockerfile b/docker/diffusers-pytorch-xformers-cuda/Dockerfile index 7714821c82..f40df34a4c 100644 --- a/docker/diffusers-pytorch-xformers-cuda/Dockerfile +++ b/docker/diffusers-pytorch-xformers-cuda/Dockerfile @@ -1,8 +1,8 @@ -FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04 +FROM nvidia/cuda:12.9.0-runtime-ubuntu20.04 LABEL maintainer="Hugging Face" LABEL repository="diffusers" -ARG PYTHON_VERSION=3.11 +ARG PYTHON_VERSION=3.10 ENV DEBIAN_FRONTEND=noninteractive RUN apt-get -y update \ @@ -36,8 +36,12 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH" RUN uv pip install --no-cache-dir \ torch \ torchvision \ - torchaudio \ - --index-url https://download.pytorch.org/whl/cu121 + torchaudio + +# Install compatible versions of numba/llvmlite for Python 3.10+ +RUN uv pip install --no-cache-dir \ + "llvmlite>=0.40.0" \ + "numba>=0.57.0" RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers[test]"
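After rebuilding either image with the changes above, a quick hypothetical in-container check that the default PyTorch wheel's CUDA runtime and the numba/llvmlite pins resolve on Python 3.10:

import llvmlite
import numba
import torch

# torch.cuda.is_available() additionally requires starting the container with GPU access (e.g. `--gpus all`).
print("torch", torch.__version__, "cuda", torch.version.cuda, "available", torch.cuda.is_available())
print("numba", numba.__version__, "llvmlite", llvmlite.__version__)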