From aea0d046f6eb759dca55a11bd9c55f89db39b3e4 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 17 Jan 2026 09:36:58 +0100 Subject: [PATCH] address feedbacks --- .../modular_pipeline_utils.py | 4 +- .../qwenimage/modular_blocks_qwenimage.py | 408 ++++-------------- .../modular_blocks_qwenimage_edit.py | 256 +++-------- .../modular_blocks_qwenimage_edit_plus.py | 147 ++----- .../modular_blocks_qwenimage_layered.py | 190 +++----- utils/modular_auto_docstring.py | 16 +- 6 files changed, 271 insertions(+), 750 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index fab7c7193e..368fbbcbd1 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -711,7 +711,7 @@ def format_params(params, header="Args", indent_level=4, max_line_length=115): formatted_params.append(param_str) - return "\n\n".join(formatted_params) + return "\n".join(formatted_params) def format_input_params(input_params, indent_level=4, max_line_length=115): @@ -781,7 +781,7 @@ def format_components(components, indent_level=4, max_line_length=115, add_empty loading_field_values = [] for field_name in component.loading_fields(): field_value = getattr(component, field_name) - if field_value is not None: + if field_value: loading_field_values.append(f"{field_name}={field_value}") # Add loading field information if available diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 85b77c2a6b..3bd4ae5683 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -62,50 +62,44 @@ logger = logging.get_logger(__name__) # auto_docstring class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): """ - class QwenImageAutoTextEncoderStep - - Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. + Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - prompt (`str`, *optional*): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. 
Outputs: - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -130,48 +124,36 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): # auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageInpaintVaeEncoderStep - - This step is used for processing image and mask inputs for inpainting tasks. It: + This step is used for processing image and mask inputs for inpainting tasks. It: - Resizes the image to the target size, based on `height` and `width`. - Processes and updates `image` and `mask_image`. - Creates `image_latents`. Components: - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - mask_image (`Image`): Mask image for inpainting. - image (`Image`): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - processed_mask_image (`None`): - mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -193,34 +175,26 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgVaeEncoderStep - - Vae encoder step that preprocess andencode the image inputs into their latent representations. + Vae encoder step that preprocess andencode the image inputs into their latent representations. Components: - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -255,36 +229,30 @@ class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): # auto_docstring class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ - class QwenImageOptionalControlNetVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. 
Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) [subfolder=] + control_image_processor (`VaeImageProcessor`) Inputs: - control_image (`Image`, *optional*): Control image for ControlNet conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - control_image_latents (`Tensor`): The latents representing the control image """ @@ -312,46 +280,32 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): # auto_docstring class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgInputStep - - Input step that prepares the inputs for the img2img denoising step. It: + Input step that prepares the inputs for the img2img denoising step. It: Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -370,48 +324,33 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): # auto_docstring class QwenImageInpaintInputStep(SequentialPipelineBlocks): """ - class QwenImageInpaintInputStep - - Input step that prepares the inputs for the inpainting denoising step. It: + Input step that prepares the inputs for the inpainting denoising step. It: Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. 
- image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -436,44 +375,32 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): # auto_docstring class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ - class QwenImageInpaintPrepareLatentsStep - - This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It: + This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It: - Add noise to the image latents to create the latents input for the denoiser. - Create the pachified latents `mask` based on the processedmask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - latents (`Tensor`): The initial random noised, can be generated in prepare latent step. - image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. - + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. - processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - width (`None`): - dtype (`None`): Outputs: - initial_noise (`Tensor`): The initial random noised used for inpainting denoising. - mask (`Tensor`): The mask to use for the inpainting process. """ @@ -498,60 +425,43 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageCoreDenoiseStep - - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. 
- num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -589,67 +499,47 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageInpaintCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -689,65 +579,46 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. 
- prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -787,74 +658,53 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetCoreDenoiseStep - - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - control_image_latents (`None`): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. 
""" @@ -896,81 +746,57 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetInpaintCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - control_image_latents (`None`): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -1014,79 +840,56 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetImg2ImgCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. 
Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - control_image_latents (`None`): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -1196,26 +999,21 @@ class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): # auto_docstring class QwenImageDecodeStep(SequentialPipelineBlocks): """ - class QwenImageDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image. + Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -1233,29 +1031,22 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - class QwenImageInpaintDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask - overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. 
Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ @@ -1302,131 +1093,102 @@ AUTO_BLOCKS = InsertableDict( # auto_docstring class QwenImageAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageAutoBlocks - - Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. + Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. - for image-to-image generation, you need to provide `image` - - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`. - to run the controlnet workflow, you need to provide `control_image` - for text-to-image generation, all you need to provide is `prompt` Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) [subfolder=] + control_image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - prompt (`str`, *optional*): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. - mask_image (`Image`, *optional*): Mask image for inpainting. - image (`Image`, *optional*): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. 
- width (`int`, *optional*): The width in pixels of the generated image. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - control_image (`Image`, *optional*): Control image for ControlNet conditioning. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - latents (`Tensor`): Pre-generated noisy latents for image generation. - num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_image_latents (`None`, *optional*): - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 3fcbc8853f..627cfce6ee 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -59,55 +59,46 @@ logger = logging.get_logger(__name__) # auto_docstring class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditVLEncoderStep - - QwenImage-Edit VL encoder step that encode the image and text prompts together. + QwenImage-Edit VL encoder step that encode the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. Outputs: - resized_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -133,33 +124,26 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. + Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -181,47 +165,36 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintVaeEncoderStep - - This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: + This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: - resize the image for target area (1024 * 1024) while maintaining the aspect ratio. - process the resized image and mask image. - create image latents. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - mask_image (`Image`): Mask image for inpainting. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - processed_mask_image (`None`): - mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. 
""" @@ -270,48 +243,34 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks): # auto_docstring class QwenImageEditInputStep(SequentialPipelineBlocks): """ - class QwenImageEditInputStep - - Input step that prepares the inputs for the edit denoising step. It: + Input step that prepares the inputs for the edit denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -335,50 +294,35 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintInputStep - - Input step that prepares the inputs for the edit inpaint denoising step. It: + Input step that prepares the inputs for the edit inpaint denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -405,44 +349,32 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintPrepareLatentsStep - - This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: + This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: - Add noise to the image latents to create the latents input for the denoiser. - Create the patchified latents `mask` based on the processed mask image. 
Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - latents (`Tensor`): The initial random noised, can be generated in prepare latent step. - image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. - + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. - processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - width (`None`): - dtype (`None`): Outputs: - initial_noise (`Tensor`): The initial random noised used for inpainting denoising. - mask (`Tensor`): The mask to use for the inpainting process. """ @@ -464,61 +396,44 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit edit (img2img) task. + Core denoising workflow for QwenImage-Edit edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -556,66 +471,47 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit edit inpaint task. + Core denoising workflow for QwenImage-Edit edit inpaint task. 
Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -694,26 +590,21 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks): # auto_docstring class QwenImageEditDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image. + Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -731,29 +622,22 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask - overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. 
""" @@ -806,103 +690,81 @@ EDIT_AUTO_BLOCKS = InsertableDict( # auto_docstring class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageEditAutoBlocks - - Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. + Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. - for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide - `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - mask_image (`Image`, *optional*): Mask image for inpainting. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - height (`int`): The height in pixels of the generated image. - width (`int`): The width in pixels of the generated image. - image_latents (`None`): - processed_mask_image (`None`, *optional*): - latents (`Tensor`): Pre-generated noisy latents for image generation. - num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. 
- strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 0364e394d2..cc07fc1e6a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -52,57 +52,48 @@ logger = logging.get_logger(__name__) # auto_docstring class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusVLEncoderStep - - QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. + QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. Outputs: - resized_cond_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -127,34 +118,27 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusVaeEncoderStep - - VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based - on its own aspect ratio to 1024x1024 target area. 
+ VAE encoder step that encodes image inputs into latent representations. + Each image is resized independently based on its own aspect ratio to 1024x1024 target area. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -184,9 +168,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditPlusInputStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusInputStep - - Input step that prepares the inputs for the Edit Plus denoising step. It: + Input step that prepares the inputs for the Edit Plus denoising step. It: - Standardizes text embeddings batch size. - Processes list of image latents: patchifies, concatenates along dim=1, expands batch. - Outputs lists of image_height/image_width for RoPE calculation. @@ -194,40 +176,28 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`List`): The image heights calculated from the image latents dimension - image_width (`List`): The image widths calculated from the image latents dimension """ @@ -254,61 +224,44 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. + Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. 
- width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -350,26 +303,21 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusDecodeStep - - Decode step that decodes the latents to images and postprocesses the generated image. + Decode step that decodes the latents to images and postprocesses the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -400,88 +348,73 @@ EDIT_PLUS_AUTO_BLOCKS = InsertableDict( # auto_docstring class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageEditPlusAutoBlocks - - Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. + Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. - `image` is required input (can be single image or list of images). - Each image is resized independently based on its own aspect ratio. - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 5602fc9b93..7cbc174871 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -53,43 +53,45 @@ logger = logging.get_logger(__name__) # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - class QwenImageLayeredTextEncoderStep - - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not - provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. 
- Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -99,50 +101,44 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - prompt (`str`, *optional*): The prompt to encode - use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. 
Outputs: - resized_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -169,36 +165,28 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageLayeredVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. + Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -226,48 +214,34 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageLayeredInputStep(SequentialPipelineBlocks): """ - class QwenImageLayeredInputStep - - Input step that prepares the inputs for the layered denoising step. It: + Input step that prepares the inputs for the layered denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension - height (`int`): The height of the image output - width (`int`): The width of the image output """ @@ -292,58 +266,42 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): # auto_docstring class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageLayeredCoreDenoiseStep - - Core denoising workflow for QwenImage-Layered img2img task. + Core denoising workflow for QwenImage-Layered img2img task. 
Components: - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -394,52 +352,55 @@ LAYERED_AUTO_BLOCKS = InsertableDict( # auto_docstring class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageLayeredAutoBlocks - - Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. + Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -449,65 +410,54 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - prompt (`str`, *optional*): The prompt to encode - use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. 
- attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py index e2d523b2f3..01d984a584 100644 --- a/utils/modular_auto_docstring.py +++ b/utils/modular_auto_docstring.py @@ -169,6 +169,17 @@ def find_auto_docstring_classes(filepath: str) -> list: return classes_to_update +def strip_class_name_line(doc: str, class_name: str) -> str: + """Remove the 'class ClassName' line from the doc if present.""" + lines = doc.strip().split("\n") + if lines and lines[0].strip() == f"class {class_name}": + # Remove the class line and any blank line following it + lines = lines[1:] + while lines and not lines[0].strip(): + lines = lines[1:] + return "\n".join(lines) + + def format_docstring(doc: str, indent: str = " ") -> str: """Format a doc string as a properly indented docstring.""" lines = doc.strip().split("\n") @@ -216,6 +227,9 @@ def process_file(filepath: str, overwrite: bool = False) -> list: print(f"Warning: Could not get doc for {class_name} in {filepath}") continue + # Remove the "class ClassName" line since it's redundant in a docstring + doc = strip_class_name_line(doc, class_name) + # Format the new docstring with 4-space indent new_docstring = format_docstring(doc, " ") @@ -283,4 +297,4 @@ if __name__ == "__main__": args = parser.parse_args() - check_auto_docstrings(args.path, args.fix_and_overwrite) + check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file
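As a quick illustration of the new strip_class_name_line helper, the sketch below feeds it a made-up doc string; it assumes the repository root is on sys.path so the utils script can be imported as a module.

```python
# Minimal, illustrative check of strip_class_name_line from utils/modular_auto_docstring.py.
# The sample doc text is invented for this example; importing the script as a module
# assumes the repository root is on sys.path (utils/ is not an installed package).
from utils.modular_auto_docstring import strip_class_name_line

doc = (
    "class QwenImageExampleStep\n"
    "\n"
    "Example step description used only to illustrate the helper.\n"
)

# The leading "class QwenImageExampleStep" line and the blank line after it are
# dropped, so the cleaned doc starts directly with the description.
cleaned = strip_class_name_line(doc, "QwenImageExampleStep")
assert cleaned == "Example step description used only to illustrate the helper."

# Docs that do not start with "class <ClassName>" pass through unchanged
# (apart from the outer whitespace strip applied by the helper).
assert strip_class_name_line("Some other doc", "QwenImageExampleStep") == "Some other doc"
```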
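The same helper composes with the existing format_docstring call inside process_file; the sketch below mirrors that order with a made-up stand-in for the doc source, since the hunk does not show how the doc text is obtained from the class.

```python
# Sketch of the order of operations wired into process_file: strip the redundant
# "class <Name>" heading first, then run the existing format_docstring(doc, "    ")
# to produce the 4-space-indented docstring. get_generated_doc is a made-up
# placeholder for however process_file actually obtains the generated doc text.
from utils.modular_auto_docstring import format_docstring, strip_class_name_line


def get_generated_doc(class_name: str) -> str:
    # Placeholder stand-in; the real doc comes from the modular pipeline class itself.
    return f"class {class_name}\n\nCore denoising workflow for QwenImage-Layered img2img task."


class_name = "QwenImageLayeredCoreDenoiseStep"  # one of the classes touched in this patch
doc = get_generated_doc(class_name)
doc = strip_class_name_line(doc, class_name)
new_docstring = format_docstring(doc, "    ")
print(new_docstring)
```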
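To regenerate the affected docstrings, the checker can be driven programmatically along the lines shown below; only the check_auto_docstrings(path, fix_and_overwrite) call shape appears in the patch, so the target path and the rewrite-in-place behavior of the second argument are assumptions.

```python
# Running the checker over the QwenImage modular pipeline files touched by this patch.
# The path below is an example; passing True as the second argument is assumed to
# rewrite the # auto_docstring docstrings in place rather than only reporting drift.
from utils.modular_auto_docstring import check_auto_docstrings

check_auto_docstrings("src/diffusers/modular_pipelines/qwenimage", True)
```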
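Finally, a hedged way to eyeball the result on one of the QwenImage blocks touched here; exposing the generated text through a doc attribute is an assumption (the utility script only refers to it indirectly), while the class and module names are taken from the modified files.

```python
# Hypothetical inspection of a regenerated docstring. The `doc` attribute name is
# assumed and guarded with getattr; the class and module names come from the files
# modified in this patch.
from diffusers.modular_pipelines.qwenimage.modular_blocks_qwenimage_edit_plus import (
    QwenImageEditPlusAutoBlocks,
)

blocks = QwenImageEditPlusAutoBlocks()
generated = getattr(blocks, "doc", None)  # assumed attribute; may be exposed differently
if generated is not None:
    # After this patch the text should open with the task description
    # ("Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.")
    # instead of a redundant "class QwenImageEditPlusAutoBlocks" line, and component
    # entries should no longer carry empty "[subfolder=]" suffixes.
    print(generated[:500])
```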