1
0
mirror of https://github.com/huggingface/diffusers.git synced 2026-01-27 17:22:53 +03:00

add TODO in the description for empty docstring

This commit is contained in:
yiyixuxu
2026-01-17 09:57:56 +01:00
parent aea0d046f6
commit 25c968a38f
7 changed files with 184 additions and 86 deletions

View File

@@ -708,6 +708,8 @@ def format_params(params, header="Args", indent_level=4, max_line_length=115):
desc = re.sub(r"\[(.*?)\]\((https?://[^\s\)]+)\)", r"[\1](\2)", param.description)
wrapped_desc = wrap_text(desc, desc_indent, max_line_length)
param_str += f"\n{desc_indent}{wrapped_desc}"
else:
param_str += f"\n{desc_indent}TODO: Add description."
formatted_params.append(param_str)

View File

@@ -1324,7 +1324,8 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True),
InputParam.template(self._image_input_name)
or InputParam(name=self._image_input_name, required=True, description="The image tensor to encode"),
InputParam.generator(),
]

View File

@@ -75,11 +75,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks):
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 34)
@@ -151,7 +148,9 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
Outputs:
processed_image (`None`):
TODO: Add description.
processed_mask_image (`None`):
TODO: Add description.
mask_overlay_kwargs (`Dict`):
The kwargs for the postprocess step to apply the mask overlay
image_latents (`Tensor`):
@@ -195,6 +194,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
Outputs:
processed_image (`None`):
TODO: Add description.
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
@@ -290,14 +290,19 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
TODO: Add description.
Outputs:
batch_size (`int`):
@@ -334,15 +339,21 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
TODO: Add description.
processed_mask_image (`None`, *optional*):
TODO: Add description.
Outputs:
batch_size (`int`):
@@ -389,14 +400,18 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
latents (`Tensor`):
The initial random noise; can be generated in the prepare latents step.
image_latents (`Tensor`):
The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.
The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
step.
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
processed_mask_image (`Tensor`):
The processed mask to use for the inpainting process.
height (`None`):
TODO: Add description.
width (`None`):
TODO: Add description.
dtype (`None`):
TODO: Add description.
Outputs:
initial_noise (`Tensor`):
@@ -425,7 +440,8 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
"""
step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.).
Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing
the inputs (timesteps, latents, rope inputs etc.).
Components:
@@ -441,9 +457,13 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
height (`int`, *optional*):
@@ -499,7 +519,8 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
"""
Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task.
Before-denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
inpaint task.
Components:
@@ -515,15 +536,21 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
TODO: Add description.
processed_mask_image (`None`, *optional*):
TODO: Add description.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
@@ -579,7 +606,8 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
"""
Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task.
Before-denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
img2img task.
Components:
@@ -595,14 +623,19 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
TODO: Add description.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
@@ -658,7 +691,8 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
"""
step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.).
Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing
the inputs (timesteps, latents, rope inputs etc.).
Components:
@@ -676,10 +710,15 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
control_image_latents (`None`):
TODO: Add description.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
@@ -746,7 +785,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
"""
Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task.
Before-denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
inpaint task.
Components:
@@ -764,16 +804,23 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
TODO: Add description.
processed_mask_image (`None`, *optional*):
TODO: Add description.
control_image_latents (`None`):
TODO: Add description.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
@@ -840,7 +887,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
"""
Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task.
Before-denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
img2img task.
Components:
@@ -858,15 +906,21 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
TODO: Add description.
control_image_latents (`None`):
TODO: Add description.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
@@ -1031,7 +1085,8 @@ class QwenImageDecodeStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
"""
Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image.
Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the mask
overlay to the original image.
Components:
@@ -1045,6 +1100,7 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`None`, *optional*):
TODO: Add description.
Outputs:
images (`List`):
@@ -1126,11 +1182,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 34)
@@ -1160,9 +1213,13 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
latents (`Tensor`):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`):
@@ -1174,10 +1231,13 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
image_latents (`None`, *optional*):
TODO: Add description.
processed_mask_image (`None`, *optional*):
TODO: Add description.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
control_image_latents (`None`, *optional*):
TODO: Add description.
control_guidance_start (`float`, *optional*, defaults to 0.0):
When to start applying ControlNet.
control_guidance_end (`float`, *optional*, defaults to 1.0):
@@ -1187,6 +1247,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`None`, *optional*):
TODO: Add description.
Outputs:
images (`List`):

View File

@@ -74,11 +74,10 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>
<|im_start|>assistant
)
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 64)
@@ -144,6 +143,7 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
resized_image (`List`):
The resized images
processed_image (`None`):
TODO: Add description.
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
@@ -192,7 +192,9 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
resized_image (`List`):
The resized images
processed_image (`None`):
TODO: Add description.
processed_mask_image (`None`):
TODO: Add description.
mask_overlay_kwargs (`Dict`):
The kwargs for the postprocess step to apply the mask overlay
image_latents (`Tensor`):
@@ -255,14 +257,19 @@ class QwenImageEditInputStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
TODO: Add description.
Outputs:
batch_size (`int`):
@@ -306,15 +313,21 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
TODO: Add description.
processed_mask_image (`None`, *optional*):
TODO: Add description.
Outputs:
batch_size (`int`):
@@ -363,14 +376,18 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
latents (`Tensor`):
The initial random noise; can be generated in the prepare latents step.
image_latents (`Tensor`):
The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.
The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
step.
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
processed_mask_image (`Tensor`):
The processed mask to use for the inpainting process.
height (`None`):
TODO: Add description.
width (`None`):
TODO: Add description.
dtype (`None`):
TODO: Add description.
Outputs:
initial_noise (`Tensor`):
@@ -412,14 +429,19 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
TODO: Add description.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
@@ -487,15 +509,21 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
TODO: Add description.
processed_mask_image (`None`, *optional*):
TODO: Add description.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
@@ -622,7 +650,8 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
"""
Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image.
Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask
overlay to the original image.
Components:
@@ -636,6 +665,7 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`None`, *optional*):
TODO: Add description.
Outputs:
images (`List`):
@@ -692,7 +722,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
"""
Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.
- for edit (img2img) generation, you need to provide `image`
- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`
- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide
`padding_mask_crop`
Components:
@@ -719,11 +750,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>
<|im_start|>assistant
)
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 64)
@@ -747,7 +777,9 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
width (`int`):
The width in pixels of the generated image.
image_latents (`None`):
TODO: Add description.
processed_mask_image (`None`, *optional*):
TODO: Add description.
latents (`Tensor`):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`):
@@ -763,6 +795,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`None`, *optional*):
TODO: Add description.
Outputs:
images (`List`):

View File

@@ -67,11 +67,10 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
{}<|im_end|> <|im_start|>assistant )
img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)
@@ -139,6 +138,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
resized_image (`List`):
The resized images
processed_image (`None`):
TODO: Add description.
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
@@ -182,14 +182,19 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
TODO: Add description.
Outputs:
batch_size (`int`):
@@ -240,14 +245,19 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
TODO: Add description.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
@@ -376,11 +386,10 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
{}<|im_end|> <|im_start|>assistant )
img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)

View File

@@ -53,7 +53,8 @@ logger = logging.get_logger(__name__)
# auto_docstring
class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
"""
QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided.
QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not
provided.
Components:
@@ -70,28 +71,23 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
Configs:
image_caption_prompt_en (default: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
# Image Annotator
You are a professional image annotator. Please write an image caption based on the input image:
You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
Please write an image caption based on the input image:
1. Write the caption using natural, descriptive language without structured formats or rich text.
2. Enrich caption details by including:
- Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
attachment relations, action relations, comparative relations, causal relations, and so on
- Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
caption with quotation marks
3. Maintain authenticity and accuracy:
- Avoid generalizations
- Describe all visible information in the image, while do not add information not explicitly shown in the image
<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
)
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
image_caption_prompt_cn (default: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
# 图像标注器
你是一个专业的图像标注器。请基于输入图像,撰写图注:
You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注:
1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。
2. 通过加入以下内容,丰富图注细节:
- 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等
@@ -101,16 +97,11 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
3. 保持真实性与准确性:
- 不要使用笼统的描述
- 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容
<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
)
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
prompt_template_encode (default: <|im_start|>system
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 34)
@@ -187,6 +178,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
resized_image (`List`):
The resized images
processed_image (`None`):
TODO: Add description.
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
@@ -226,10 +218,15 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
image_latents (`None`, *optional*):
TODO: Add description.
Outputs:
batch_size (`int`):
@@ -282,10 +279,15 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
TODO: Add description.
prompt_embeds_mask (`None`):
TODO: Add description.
negative_prompt_embeds (`None`, *optional*):
TODO: Add description.
negative_prompt_embeds_mask (`None`, *optional*):
TODO: Add description.
image_latents (`None`, *optional*):
TODO: Add description.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
layers (`int`, *optional*, defaults to 4):
@@ -379,28 +381,23 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
Configs:
image_caption_prompt_en (default: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
# Image Annotator
You are a professional image annotator. Please write an image caption based on the input image:
You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
Please write an image caption based on the input image:
1. Write the caption using natural, descriptive language without structured formats or rich text.
2. Enrich caption details by including:
- Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
attachment relations, action relations, comparative relations, causal relations, and so on
- Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
caption with quotation marks
3. Maintain authenticity and accuracy:
- Avoid generalizations
- Describe all visible information in the image, while do not add information not explicitly shown in the image
<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
)
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
image_caption_prompt_cn (default: <|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
# 图像标注器
你是一个专业的图像标注器。请基于输入图像,撰写图注:
You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注:
1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。
2. 通过加入以下内容,丰富图注细节:
- 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等
@@ -410,16 +407,11 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
3. 保持真实性与准确性:
- 不要使用笼统的描述
- 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容
<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
)
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
prompt_template_encode (default: <|im_start|>system
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 34)

View File

@@ -297,4 +297,4 @@ if __name__ == "__main__":
args = parser.parse_args()
check_auto_docstrings(args.path, args.fix_and_overwrite)
check_auto_docstrings(args.path, args.fix_and_overwrite)