
address feedbacks

This commit is contained in:
yiyixuxu
2026-01-17 09:36:58 +01:00
parent 1c90ce33f2
commit aea0d046f6
6 changed files with 271 additions and 750 deletions


@@ -711,7 +711,7 @@ def format_params(params, header="Args", indent_level=4, max_line_length=115):
         formatted_params.append(param_str)
-    return "\n\n".join(formatted_params)
+    return "\n".join(formatted_params)

 def format_input_params(input_params, indent_level=4, max_line_length=115):
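The one-character change above switches the separator between formatted parameter blocks from a blank line to a single newline. A minimal runnable sketch of the difference (the parameter strings are illustrative, not the formatter's real output):

formatted_params = [
    "    prompt (`str`):\n        The prompt to guide generation.",
    "    width (`int`, *optional*):\n        Output width in pixels.",
]

print("\n\n".join(formatted_params))  # old: a blank line between parameter blocks
print("\n".join(formatted_params))    # new: parameter blocks on consecutive lines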
@@ -781,7 +781,7 @@ def format_components(components, indent_level=4, max_line_length=115, add_empty
         loading_field_values = []
         for field_name in component.loading_fields():
             field_value = getattr(component, field_name)
-            if field_value is not None:
+            if field_value:
                 loading_field_values.append(f"{field_name}={field_value}")
         # Add loading field information if available
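This truthiness check is why the `[subfolder=]` suffixes vanish from the docstrings below: an empty-string subfolder is not None, but it is falsy. A self-contained sketch (the stand-in class is hypothetical, modeled on the loop above):

class FakeComponentSpec:
    """Hypothetical stand-in for a component spec with loading fields."""

    subfolder = ""   # empty string: not None, but falsy
    revision = None
    variant = "fp16"

    def loading_fields(self):
        return ["subfolder", "revision", "variant"]

component = FakeComponentSpec()
loading_field_values = []
for field_name in component.loading_fields():
    field_value = getattr(component, field_name)
    if field_value:  # the old `is not None` check kept entries like `subfolder=`
        loading_field_values.append(f"{field_name}={field_value}")

print(loading_field_values)  # ['variant=fp16']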


@@ -59,55 +59,46 @@ logger = logging.get_logger(__name__)
# auto_docstring
class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageEditVLEncoderStep
QwenImage-Edit VL encoder step that encodes the image and text prompts together.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`) [subfolder=]
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>
<|im_start|>assistant
)
prompt_template_encode_start_idx (default: 64)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
Outputs:
resized_image (`List`):
The resized images
prompt_embeds (`Tensor`):
The prompt embeddings
prompt_embeds_mask (`Tensor`):
The encoder attention mask
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask
"""
@@ -133,33 +124,26 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageEditVaeEncoderStep
VAE encoder step that encodes the image inputs into their latent representations.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`None`):
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
@@ -181,47 +165,36 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintVaeEncoderStep
This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:
- resize the image to the target area (1024 * 1024) while maintaining the aspect ratio.
- process the resized image and mask image.
- create image latents.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
image_mask_processor (`InpaintProcessor`) [subfolder=]
image_mask_processor (`InpaintProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
mask_image (`Image`):
Mask image for inpainting.
padding_mask_crop (`int`, *optional*):
Padding for mask cropping in inpainting.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`None`):
processed_mask_image (`None`):
mask_overlay_kwargs (`Dict`):
The kwargs for the postprocess step to apply the mask overlay
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
@@ -270,48 +243,34 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
# auto_docstring
class QwenImageEditInputStep(SequentialPipelineBlocks):
"""
class QwenImageEditInputStep
Input step that prepares the inputs for the edit denoising step. It:
- make sure the text embeddings, as well as the additional inputs, have a consistent batch size.
- update height/width based on `image_latents`, patchify `image_latents`.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
Outputs:
batch_size (`int`):
Number of prompts; the final batch size of model inputs will be batch_size * num_images_per_prompt
dtype (`dtype`):
Data type of model tensor inputs (determined by `prompt_embeds`)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
"""
@@ -335,50 +294,35 @@ class QwenImageEditInputStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintInputStep
Input step that prepares the inputs for the edit inpaint denoising step. It:
- make sure the text embeddings, as well as the additional inputs, have a consistent batch size.
- update height/width based on `image_latents`, patchify `image_latents`.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
processed_mask_image (`None`, *optional*):
Outputs:
batch_size (`int`):
Number of prompts; the final batch size of model inputs will be batch_size * num_images_per_prompt
dtype (`dtype`):
Data type of model tensor inputs (determined by `prompt_embeds`)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
"""
@@ -405,44 +349,32 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintPrepareLatentsStep
This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:
- Add noise to the image latents to create the latents input for the denoiser.
- Create the patchified latents `mask` based on the processed mask image.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
Inputs:
latents (`Tensor`):
The initial random noise; can be generated in the prepare latents step.
image_latents (`Tensor`):
The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
step.
The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
processed_mask_image (`Tensor`):
The processed mask to use for the inpainting process.
height (`None`):
width (`None`):
dtype (`None`):
Outputs:
initial_noise (`Tensor`):
The initial random noise used for inpainting denoising.
mask (`Tensor`):
The mask to use for the inpainting process.
"""
@@ -464,61 +396,44 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageEditCoreDenoiseStep
Core denoising workflow for QwenImage-Edit edit (img2img) task.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
@@ -556,66 +471,47 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintCoreDenoiseStep
Core denoising workflow for QwenImage-Edit edit inpaint task.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
processed_mask_image (`None`, *optional*):
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
@@ -694,26 +590,21 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
# auto_docstring
class QwenImageEditDecodeStep(SequentialPipelineBlocks):
"""
class QwenImageEditDecodeStep
Decode step that decodes the latents to images and postprocesses the generated image.
Components:
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
Inputs:
latents (`Tensor`):
The latents to decode, can be generated in the denoise step
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images.
"""
@@ -731,29 +622,22 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintDecodeStep
Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask
overlay to the original image.
Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image.
Components:
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
image_mask_processor (`InpaintProcessor`) [subfolder=]
image_mask_processor (`InpaintProcessor`)
Inputs:
latents (`Tensor`):
The latents to decode, can be generated in the denoise step
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`None`, *optional*):
Outputs:
images (`List`):
Generated images.
"""
@@ -806,103 +690,81 @@ EDIT_AUTO_BLOCKS = InsertableDict(
# auto_docstring
class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
"""
class QwenImageEditAutoBlocks
Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.
- for edit (img2img) generation, you need to provide `image`
- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide
`padding_mask_crop`
- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`) [subfolder=]
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
image_mask_processor (`InpaintProcessor`) [subfolder=]
image_mask_processor (`InpaintProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>
<|im_start|>assistant
)
prompt_template_encode_start_idx (default: 64)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
mask_image (`Image`, *optional*):
Mask image for inpainting.
padding_mask_crop (`int`, *optional*):
Padding for mask cropping in inpainting.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
height (`int`):
The height in pixels of the generated image.
width (`int`):
The width in pixels of the generated image.
image_latents (`None`):
processed_mask_image (`None`, *optional*):
latents (`Tensor`):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`None`, *optional*):
Outputs:
images (`List`):
Generated images.
"""


@@ -52,57 +52,48 @@ logger = logging.get_logger(__name__)
# auto_docstring
class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusVLEncoderStep
QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`) [subfolder=]
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
{}<|im_end|> <|im_start|>assistant )
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)
prompt_template_encode_start_idx (default: 64)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
Outputs:
resized_cond_image (`List`):
The resized images
prompt_embeds (`Tensor`):
The prompt embeddings
prompt_embeds_mask (`Tensor`):
The encoder attention mask
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask
"""
@@ -127,34 +118,27 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusVaeEncoderStep
VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based
on its own aspect ratio to 1024x1024 target area.
VAE encoder step that encodes image inputs into latent representations.
Each image is resized independently based on its own aspect ratio to 1024x1024 target area.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`None`):
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
@@ -184,9 +168,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusInputStep
Input step that prepares the inputs for the Edit Plus denoising step. It:
- Standardizes text embeddings batch size.
- Processes list of image latents: patchifies, concatenates along dim=1, expands batch.
- Outputs lists of image_height/image_width for RoPE calculation.
@@ -194,40 +176,28 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
Outputs:
batch_size (`int`):
Number of prompts; the final batch size of model inputs will be batch_size * num_images_per_prompt
dtype (`dtype`):
Data type of model tensor inputs (determined by `prompt_embeds`)
image_height (`List`):
The image heights calculated from the image latents dimension
image_width (`List`):
The image widths calculated from the image latents dimension
"""
@@ -254,61 +224,44 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusCoreDenoiseStep
Core denoising workflow for QwenImage-Edit Plus edit (img2img) task.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
@@ -350,26 +303,21 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusDecodeStep
Decode step that decodes the latents to images and postprocesses the generated image.
Components:
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
Inputs:
latents (`Tensor`):
The latents to decode, can be generated in the denoise step
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images.
"""
@@ -400,88 +348,73 @@ EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
# auto_docstring
class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
"""
class QwenImageEditPlusAutoBlocks
Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.
- `image` is required input (can be single image or list of images).
- Each image is resized independently based on its own aspect ratio.
- VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`) [subfolder=]
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
{}<|im_end|> <|im_start|>assistant )
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)
prompt_template_encode_start_idx (default: 64)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images.
"""


@@ -53,43 +53,45 @@ logger = logging.get_logger(__name__)
# auto_docstring
class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageLayeredTextEncoderStep
QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not
provided.
QwenImage-Layered Text encoder step that encodes the text prompt; a prompt will be generated from the image if none is provided.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`) [subfolder=]
processor (`Qwen2VLProcessor`)
tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=]
tokenizer (`Qwen2Tokenizer`): The tokenizer to use
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
Configs:
image_caption_prompt_en (default: <|im_start|>system
You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
Please write an image caption based on the input image:
You are a helpful assistant.<|im_end|>
<|im_start|>user
# Image Annotator
You are a professional image annotator. Please write an image caption based on the input image:
1. Write the caption using natural, descriptive language without structured formats or rich text.
2. Enrich caption details by including:
- Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
attachment relations, action relations, comparative relations, causal relations, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on
- Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
caption with quotation marks
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks
3. Maintain authenticity and accuracy:
- Avoid generalizations
- Describe all visible information in the image, while do not add information not explicitly shown in the image
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
)
image_caption_prompt_cn (default: <|im_start|>system
You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注:
You are a helpful assistant.<|im_end|>
<|im_start|>user
# 图像标注器
你是一个专业的图像标注器。请基于输入图像,撰写图注:
1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。
2. 通过加入以下内容,丰富图注细节:
- 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等
@@ -99,50 +101,44 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
3. 保持真实性与准确性:
- 不要使用笼统的描述
- 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
)
prompt_template_encode (default: <|im_start|>system
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
prompt_template_encode_start_idx (default: 34)
tokenizer_max_length (default: 1024)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to; can be 1024 or 640
prompt (`str`, *optional*):
The prompt to encode
use_en_prompt (`bool`, *optional*, defaults to False):
Whether to use English prompt template
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
max_sequence_length (`int`, *optional*, defaults to 1024):
Maximum sequence length for prompt encoding.
Outputs:
resized_image (`List`):
The resized images
prompt_embeds (`Tensor`):
The prompt embeddings
prompt_embeds_mask (`Tensor`):
The encoder attention mask
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask
"""
@@ -169,36 +165,28 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageLayeredVaeEncoderStep
VAE encoder step that encodes the image inputs into their latent representations.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to; can be 1024 or 640
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`None`):
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
@@ -226,48 +214,34 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageLayeredInputStep(SequentialPipelineBlocks):
"""
class QwenImageLayeredInputStep
Input step that prepares the inputs for the layered denoising step. It:
- make sure the text embeddings, as well as the additional inputs, have a consistent batch size.
- update height/width based on `image_latents`, patchify `image_latents`.
Components:
pachifier (`QwenImageLayeredPachifier`) [subfolder=]
pachifier (`QwenImageLayeredPachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
image_latents (`None`, *optional*):
Outputs:
batch_size (`int`):
Number of prompts; the final batch size of model inputs will be batch_size * num_images_per_prompt
dtype (`dtype`):
Data type of model tensor inputs (determined by `prompt_embeds`)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
height (`int`):
The height of the image output
width (`int`):
The width of the image output
"""
@@ -292,58 +266,42 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageLayeredCoreDenoiseStep
Core denoising workflow for QwenImage-Layered img2img task.
Components:
pachifier (`QwenImageLayeredPachifier`) [subfolder=]
pachifier (`QwenImageLayeredPachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
image_latents (`None`, *optional*):
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
layers (`int`, *optional*, defaults to 4):
Number of layers to extract from the image
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
@@ -394,52 +352,55 @@ LAYERED_AUTO_BLOCKS = InsertableDict(
# auto_docstring
class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
"""
class QwenImageLayeredAutoBlocks
Auto Modular pipeline for layered denoising tasks using QwenImage-Layered.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`) [subfolder=]
processor (`Qwen2VLProcessor`)
tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=]
tokenizer (`Qwen2Tokenizer`): The tokenizer to use
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
pachifier (`QwenImageLayeredPachifier`) [subfolder=]
pachifier (`QwenImageLayeredPachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Configs:
image_caption_prompt_en (default: <|im_start|>system
You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
Please write an image caption based on the input image:
You are a helpful assistant.<|im_end|>
<|im_start|>user
# Image Annotator
You are a professional image annotator. Please write an image caption based on the input image:
1. Write the caption using natural, descriptive language without structured formats or rich text.
2. Enrich caption details by including:
- Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
attachment relations, action relations, comparative relations, causal relations, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on
- Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
caption with quotation marks
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks
3. Maintain authenticity and accuracy:
- Avoid generalizations
- Describe all visible information in the image, while do not add information not explicitly shown in the image
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
)
image_caption_prompt_cn (default: <|im_start|>system
You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注:
You are a helpful assistant.<|im_end|>
<|im_start|>user
# 图像标注器
你是一个专业的图像标注器。请基于输入图像,撰写图注:
1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。
2. 通过加入以下内容,丰富图注细节:
- 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等
@@ -449,65 +410,54 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
3. 保持真实性与准确性:
- 不要使用笼统的描述
- 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
)
prompt_template_encode (default: <|im_start|>system
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
prompt_template_encode_start_idx (default: 34)
tokenizer_max_length (default: 1024)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to; can be 1024 or 640
prompt (`str`, *optional*):
The prompt to encode
use_en_prompt (`bool`, *optional*, defaults to False):
Whether to use English prompt template
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
max_sequence_length (`int`, *optional*, defaults to 1024):
Maximum sequence length for prompt encoding.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
layers (`int`, *optional*, defaults to 4):
Number of layers to extract from the image
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images.
"""


@@ -169,6 +169,17 @@ def find_auto_docstring_classes(filepath: str) -> list:
     return classes_to_update

+def strip_class_name_line(doc: str, class_name: str) -> str:
+    """Remove the 'class ClassName' line from the doc if present."""
+    lines = doc.strip().split("\n")
+    if lines and lines[0].strip() == f"class {class_name}":
+        # Remove the class line and any blank line following it
+        lines = lines[1:]
+        while lines and not lines[0].strip():
+            lines = lines[1:]
+    return "\n".join(lines)
+
 def format_docstring(doc: str, indent: str = "    ") -> str:
     """Format a doc string as a properly indented docstring."""
     lines = doc.strip().split("\n")

@@ -216,6 +227,9 @@ def process_file(filepath: str, overwrite: bool = False) -> list:
             print(f"Warning: Could not get doc for {class_name} in {filepath}")
             continue
+        # Remove the "class ClassName" line since it's redundant in a docstring
+        doc = strip_class_name_line(doc, class_name)
+
         # Format the new docstring with 4-space indent
         new_docstring = format_docstring(doc, "    ")

@@ -283,4 +297,4 @@ if __name__ == "__main__":
     args = parser.parse_args()
-    check_auto_docstrings(args.path, args.fix_and_overwrite)
\ No newline at end of file
+    check_auto_docstrings(args.path, args.fix_and_overwrite)
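The new helper's behavior is easy to check directly from its definition above:

doc = "class QwenImageEditDecodeStep\n\nDecode step that decodes the latents to images."
print(strip_class_name_line(doc, "QwenImageEditDecodeStep"))
# -> "Decode step that decodes the latents to images."

print(strip_class_name_line("Unrelated doc.", "QwenImageEditDecodeStep"))
# -> "Unrelated doc."  (left untouched when there is no class-name header)

To regenerate the docstrings after such a change, the check script can be re-run in fix mode; the script path and flag spelling below are assumptions based on the argparse snippet and the usual diffusers convention:

python utils/check_auto_docstrings.py --fix_and_overwrite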