diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py
index 368fbbcbd1..45556c538a 100644
--- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py
+++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py
@@ -708,6 +708,8 @@ def format_params(params, header="Args", indent_level=4, max_line_length=115):
             desc = re.sub(r"\[(.*?)\]\((https?://[^\s\)]+)\)", r"[\1](\2)", param.description)
             wrapped_desc = wrap_text(desc, desc_indent, max_line_length)
             param_str += f"\n{desc_indent}{wrapped_desc}"
+        else:
+            param_str += f"\n{desc_indent}TODO: Add description."
 
         formatted_params.append(param_str)
 
diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py
index f0dd6471b1..8d7b190542 100644
--- a/src/diffusers/modular_pipelines/qwenimage/encoders.py
+++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py
@@ -1324,7 +1324,8 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks):
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True),
+            InputParam.template(self._image_input_name)
+            or InputParam(name=self._image_input_name, required=True, description="The image tensor to encode"),
             InputParam.generator(),
         ]
 
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
index 3bd4ae5683..645c01f66e 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
@@ -75,11 +75,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks):
       Configs:
 
           prompt_template_encode (default: <|im_start|>system
-    Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
-    <|im_start|>user
-    {}<|im_end|>
-    <|im_start|>assistant
-    )
+    Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
+    objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
 
           prompt_template_encode_start_idx (default: 34)
 
@@ -151,7 +148,9 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
 
       Outputs:
           processed_image (`None`):
+              TODO: Add description.
           processed_mask_image (`None`):
+              TODO: Add description.
           mask_overlay_kwargs (`Dict`):
               The kwargs for the postprocess step to apply the mask overlay
           image_latents (`Tensor`):
@@ -195,6 +194,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
 
       Outputs:
           processed_image (`None`):
+              TODO: Add description.
           image_latents (`Tensor`):
               The latents representing the reference image(s). Single tensor or list depending on input.
     """
@@ -290,14 +290,19 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
           image_latents (`None`, *optional*):
+              TODO: Add description.
 
       Outputs:
           batch_size (`int`):
@@ -334,15 +339,21 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
           image_latents (`None`, *optional*):
+              TODO: Add description.
           processed_mask_image (`None`, *optional*):
+              TODO: Add description.
 
       Outputs:
           batch_size (`int`):
@@ -389,14 +400,18 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
           latents (`Tensor`):
               The initial random noised, can be generated in prepare latent step.
           image_latents (`Tensor`):
-              The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.
+              The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
+              step.
           timesteps (`Tensor`):
               The timesteps to use for the denoising process. Can be generated in set_timesteps step.
           processed_mask_image (`Tensor`):
               The processed mask to use for the inpainting process.
           height (`None`):
+              TODO: Add description.
           width (`None`):
+              TODO: Add description.
           dtype (`None`):
+              TODO: Add description.
 
       Outputs:
           initial_noise (`Tensor`):
@@ -425,7 +440,8 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.).
+    Step that denoises noise into an image for the text2image task. It includes the denoise loop and prepares the
+    inputs (timesteps, latents, rope inputs etc.).
 
       Components:
 
@@ -441,9 +457,13 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           latents (`Tensor`, *optional*):
               Pre-generated noisy latents for image generation.
           height (`int`, *optional*):
@@ -499,7 +519,8 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task.
+    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
+    inpaint task.
 
       Components:
 
@@ -515,15 +536,21 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
           image_latents (`None`, *optional*):
+              TODO: Add description.
           processed_mask_image (`None`, *optional*):
+              TODO: Add description.
           latents (`Tensor`, *optional*):
               Pre-generated noisy latents for image generation.
           generator (`Generator`, *optional*):
@@ -579,7 +606,8 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task.
+    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
+    img2img task.
 
       Components:
 
@@ -595,14 +623,19 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
           image_latents (`None`, *optional*):
+              TODO: Add description.
           latents (`Tensor`, *optional*):
               Pre-generated noisy latents for image generation.
           generator (`Generator`, *optional*):
@@ -658,7 +691,8 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.).
+    Step that denoises noise into an image for the text2image task. It includes the denoise loop and prepares the
+    inputs (timesteps, latents, rope inputs etc.).
 
       Components:
 
@@ -676,10 +710,15 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           control_image_latents (`None`):
+              TODO: Add description.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
@@ -746,7 +785,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task.
+    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
+    inpaint task.
 
       Components:
 
@@ -764,16 +804,23 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
           image_latents (`None`, *optional*):
+              TODO: Add description.
           processed_mask_image (`None`, *optional*):
+              TODO: Add description.
           control_image_latents (`None`):
+              TODO: Add description.
           latents (`Tensor`, *optional*):
               Pre-generated noisy latents for image generation.
           generator (`Generator`, *optional*):
@@ -840,7 +887,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task.
+    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
+    img2img task.
 
       Components:
 
@@ -858,15 +906,21 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
           image_latents (`None`, *optional*):
+              TODO: Add description.
           control_image_latents (`None`):
+              TODO: Add description.
           latents (`Tensor`, *optional*):
               Pre-generated noisy latents for image generation.
           generator (`Generator`, *optional*):
@@ -1031,7 +1085,8 @@ class QwenImageDecodeStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
     """
-    Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image.
+    Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask
+    overlay to the original image.
 
       Components:
 
@@ -1045,6 +1100,7 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
           output_type (`str`, *optional*, defaults to pil):
               Output format: 'pil', 'np', 'pt''.
           mask_overlay_kwargs (`None`, *optional*):
+              TODO: Add description.
 
       Outputs:
           images (`List`):
@@ -1126,11 +1182,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
       Configs:
 
           prompt_template_encode (default: <|im_start|>system
-    Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
-    <|im_start|>user
-    {}<|im_end|>
-    <|im_start|>assistant
-    )
+    Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
+    objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
 
           prompt_template_encode_start_idx (default: 34)
 
@@ -1160,9 +1213,13 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           latents (`Tensor`):
               Pre-generated noisy latents for image generation.
           num_inference_steps (`int`):
@@ -1174,10 +1231,13 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
           **denoiser_input_fields (`Tensor`, *optional*):
               conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
           image_latents (`None`, *optional*):
+              TODO: Add description.
           processed_mask_image (`None`, *optional*):
+              TODO: Add description.
           strength (`float`, *optional*, defaults to 0.9):
               Strength for img2img/inpainting.
           control_image_latents (`None`, *optional*):
+              TODO: Add description.
           control_guidance_start (`float`, *optional*, defaults to 0.0):
               When to start applying ControlNet.
           control_guidance_end (`float`, *optional*, defaults to 1.0):
@@ -1187,6 +1247,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
           output_type (`str`, *optional*, defaults to pil):
               Output format: 'pil', 'np', 'pt''.
           mask_overlay_kwargs (`None`, *optional*):
+              TODO: Add description.
 
       Outputs:
           images (`List`):
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py
index 627cfce6ee..0bfbb921c9 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py
@@ -74,11 +74,10 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
       Configs:
 
           prompt_template_encode (default: <|im_start|>system
-    Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
-    <|im_start|>user
-    <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>
-    <|im_start|>assistant
-    )
+    Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
+    the user's text instruction should alter or modify the image. Generate a new image that meets the user's
+    requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
+    <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
 
           prompt_template_encode_start_idx (default: 64)
 
@@ -144,6 +143,7 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
           resized_image (`List`):
               The resized images
           processed_image (`None`):
+              TODO: Add description.
           image_latents (`Tensor`):
               The latents representing the reference image(s). Single tensor or list depending on input.
     """
@@ -192,7 +192,9 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
           resized_image (`List`):
               The resized images
           processed_image (`None`):
+              TODO: Add description.
           processed_mask_image (`None`):
+              TODO: Add description.
           mask_overlay_kwargs (`Dict`):
               The kwargs for the postprocess step to apply the mask overlay
           image_latents (`Tensor`):
@@ -255,14 +257,19 @@ class QwenImageEditInputStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
           image_latents (`None`, *optional*):
+              TODO: Add description.
 
       Outputs:
           batch_size (`int`):
@@ -306,15 +313,21 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
           image_latents (`None`, *optional*):
+              TODO: Add description.
           processed_mask_image (`None`, *optional*):
+              TODO: Add description.
 
       Outputs:
           batch_size (`int`):
@@ -363,14 +376,18 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
           latents (`Tensor`):
               The initial random noised, can be generated in prepare latent step.
           image_latents (`Tensor`):
-              The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.
+              The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
+              step.
           timesteps (`Tensor`):
               The timesteps to use for the denoising process. Can be generated in set_timesteps step.
           processed_mask_image (`Tensor`):
               The processed mask to use for the inpainting process.
           height (`None`):
+              TODO: Add description.
           width (`None`):
+              TODO: Add description.
           dtype (`None`):
+              TODO: Add description.
 
       Outputs:
           initial_noise (`Tensor`):
@@ -412,14 +429,19 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
           image_latents (`None`, *optional*):
+              TODO: Add description.
           latents (`Tensor`, *optional*):
               Pre-generated noisy latents for image generation.
           generator (`Generator`, *optional*):
@@ -487,15 +509,21 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
           image_latents (`None`, *optional*):
+              TODO: Add description.
           processed_mask_image (`None`, *optional*):
+              TODO: Add description.
           latents (`Tensor`, *optional*):
               Pre-generated noisy latents for image generation.
           generator (`Generator`, *optional*):
@@ -622,7 +650,8 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
     """
-    Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image.
+    Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask
+    overlay to the original image.
 
       Components:
 
@@ -636,6 +665,7 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
           output_type (`str`, *optional*, defaults to pil):
               Output format: 'pil', 'np', 'pt''.
           mask_overlay_kwargs (`None`, *optional*):
+              TODO: Add description.
 
       Outputs:
           images (`List`):
@@ -692,7 +722,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
     """
     Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.
       - for edit (img2img) generation, you need to provide `image`
-      - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`
+      - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide
+        `padding_mask_crop`
 
       Components:
 
@@ -719,11 +750,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
       Configs:
 
           prompt_template_encode (default: <|im_start|>system
-    Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
-    <|im_start|>user
-    <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>
-    <|im_start|>assistant
-    )
+    Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
+    the user's text instruction should alter or modify the image. Generate a new image that meets the user's
+    requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
+    <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
 
           prompt_template_encode_start_idx (default: 64)
 
@@ -747,7 +777,9 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
           width (`int`):
               The width in pixels of the generated image.
           image_latents (`None`):
+              TODO: Add description.
           processed_mask_image (`None`, *optional*):
+              TODO: Add description.
           latents (`Tensor`):
               Pre-generated noisy latents for image generation.
           num_inference_steps (`int`):
@@ -763,6 +795,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
           output_type (`str`, *optional*, defaults to pil):
               Output format: 'pil', 'np', 'pt''.
           mask_overlay_kwargs (`None`, *optional*):
+              TODO: Add description.
 
       Outputs:
           images (`List`):
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py
index cc07fc1e6a..8dab6fbcf9 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py
@@ -67,11 +67,10 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
       Configs:
 
           prompt_template_encode (default: <|im_start|>system
-    Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
-    <|im_start|>user
-    {}<|im_end|>
-    <|im_start|>assistant
-    )
+    Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
+    the user's text instruction should alter or modify the image. Generate a new image that meets the user's
+    requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
+    {}<|im_end|> <|im_start|>assistant )
 
           img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)
 
@@ -139,6 +138,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
           resized_image (`List`):
               The resized images
           processed_image (`None`):
+              TODO: Add description.
           image_latents (`Tensor`):
               The latents representing the reference image(s). Single tensor or list depending on input.
     """
@@ -182,14 +182,19 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
           image_latents (`None`, *optional*):
+              TODO: Add description.
 
       Outputs:
           batch_size (`int`):
@@ -240,14 +245,19 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           height (`int`, *optional*):
               The height in pixels of the generated image.
           width (`int`, *optional*):
               The width in pixels of the generated image.
           image_latents (`None`, *optional*):
+              TODO: Add description.
           latents (`Tensor`, *optional*):
               Pre-generated noisy latents for image generation.
           generator (`Generator`, *optional*):
@@ -376,11 +386,10 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
       Configs:
 
           prompt_template_encode (default: <|im_start|>system
-    Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
-    <|im_start|>user
-    {}<|im_end|>
-    <|im_start|>assistant
-    )
+    Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
+    the user's text instruction should alter or modify the image. Generate a new image that meets the user's
+    requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
+    {}<|im_end|> <|im_start|>assistant )
 
           img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)
 
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py
index 7cbc174871..544b1abfc3 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py
@@ -53,7 +53,8 @@ logger = logging.get_logger(__name__)
 # auto_docstring
 class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
     """
-    QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided.
+    QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not
+    provided.
 
       Components:
 
@@ -70,28 +71,23 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
       Configs:
 
           image_caption_prompt_en (default: <|im_start|>system
-    You are a helpful assistant.<|im_end|>
-    <|im_start|>user
-    # Image Annotator
-    You are a professional image annotator. Please write an image caption based on the input image:
+    You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
+    Please write an image caption based on the input image:
     1. Write the caption using natural, descriptive language without structured formats or rich text.
     2. Enrich caption details by including:
      - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
-     - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on
+     - Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
+       attachment relations, action relations, comparative relations, causal relations, and so on
      - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
-     - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks
+     - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
+       caption with quotation marks
     3. Maintain authenticity and accuracy:
      - Avoid generalizations
      - Describe all visible information in the image, while do not add information not explicitly shown in the image
-    <|vision_start|><|image_pad|><|vision_end|><|im_end|>
-    <|im_start|>assistant
-    )
+    <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
 
           image_caption_prompt_cn (default: <|im_start|>system
-    You are a helpful assistant.<|im_end|>
-    <|im_start|>user
-    # 图像标注器
-    你是一个专业的图像标注器。请基于输入图像，撰写图注:
+    You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像，撰写图注:
     1. 使用自然、描述性的语言撰写图注，不要使用结构化形式或富文本形式。
     2. 通过加入以下内容，丰富图注细节：
      - 对象的属性：如数量、颜色、形状、大小、位置、材质、状态、动作等
@@ -101,16 +97,11 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
     3. 保持真实性与准确性：
      - 不要使用笼统的描述
      - 描述图像中所有可见的信息，但不要加入没有在图像中出现的内容
-    <|vision_start|><|image_pad|><|vision_end|><|im_end|>
-    <|im_start|>assistant
-    )
+    <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
 
           prompt_template_encode (default: <|im_start|>system
-    Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
-    <|im_start|>user
-    {}<|im_end|>
-    <|im_start|>assistant
-    )
+    Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
+    objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
 
           prompt_template_encode_start_idx (default: 34)
 
@@ -187,6 +178,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
           resized_image (`List`):
               The resized images
           processed_image (`None`):
+              TODO: Add description.
           image_latents (`Tensor`):
               The latents representing the reference image(s). Single tensor or list depending on input.
     """
@@ -226,10 +218,15 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           image_latents (`None`, *optional*):
+              TODO: Add description.
 
       Outputs:
           batch_size (`int`):
@@ -282,10 +279,15 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
           num_images_per_prompt (`int`, *optional*, defaults to 1):
               The number of images to generate per prompt.
           prompt_embeds (`None`):
+              TODO: Add description.
           prompt_embeds_mask (`None`):
+              TODO: Add description.
           negative_prompt_embeds (`None`, *optional*):
+              TODO: Add description.
           negative_prompt_embeds_mask (`None`, *optional*):
+              TODO: Add description.
           image_latents (`None`, *optional*):
+              TODO: Add description.
           latents (`Tensor`, *optional*):
               Pre-generated noisy latents for image generation.
           layers (`int`, *optional*, defaults to 4):
@@ -379,28 +381,23 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
       Configs:
 
           image_caption_prompt_en (default: <|im_start|>system
-    You are a helpful assistant.<|im_end|>
-    <|im_start|>user
-    # Image Annotator
-    You are a professional image annotator. Please write an image caption based on the input image:
+    You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
+    Please write an image caption based on the input image:
     1. Write the caption using natural, descriptive language without structured formats or rich text.
     2. Enrich caption details by including:
      - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
-     - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on
+     - Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
+       attachment relations, action relations, comparative relations, causal relations, and so on
      - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
-     - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks
+     - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
+       caption with quotation marks
     3. Maintain authenticity and accuracy:
      - Avoid generalizations
      - Describe all visible information in the image, while do not add information not explicitly shown in the image
-    <|vision_start|><|image_pad|><|vision_end|><|im_end|>
-    <|im_start|>assistant
-    )
+    <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
 
           image_caption_prompt_cn (default: <|im_start|>system
-    You are a helpful assistant.<|im_end|>
-    <|im_start|>user
-    # 图像标注器
-    你是一个专业的图像标注器。请基于输入图像，撰写图注:
+    You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像，撰写图注:
     1. 使用自然、描述性的语言撰写图注，不要使用结构化形式或富文本形式。
     2. 通过加入以下内容，丰富图注细节：
      - 对象的属性：如数量、颜色、形状、大小、位置、材质、状态、动作等
@@ -410,16 +407,11 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
     3. 保持真实性与准确性：
      - 不要使用笼统的描述
      - 描述图像中所有可见的信息，但不要加入没有在图像中出现的内容
-    <|vision_start|><|image_pad|><|vision_end|><|im_end|>
-    <|im_start|>assistant
-    )
+    <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
 
           prompt_template_encode (default: <|im_start|>system
-    Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
-    <|im_start|>user
-    {}<|im_end|>
-    <|im_start|>assistant
-    )
+    Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
+    objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
 
           prompt_template_encode_start_idx (default: 34)
 
diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py
index 01d984a584..7bb2c87e81 100644
--- a/utils/modular_auto_docstring.py
+++ b/utils/modular_auto_docstring.py
@@ -297,4 +297,4 @@ if __name__ == "__main__":
 
     args = parser.parse_args()
 
-    check_auto_docstrings(args.path, args.fix_and_overwrite)
\ No newline at end of file
+    check_auto_docstrings(args.path, args.fix_and_overwrite)