add modular_auto_docstring!

2026-01-27 17:22:53 +03:00 · 2026-01-10 11:55:03 +01:00
parent 34a743e2dc
commit ff09bf1a63
2 changed files with 1104 additions and 6 deletions
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
@@ -58,8 +58,59 @@ logger = logging.get_logger(__name__)
 # 1. TEXT ENCODER
 # ====================

-
+# auto_docstring
 class QwenImageAutoTextEncoderStep(AutoPipelineBlocks):
+    """
+    class QwenImageAutoTextEncoderStep
+
+      Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block.
+
+      Components:
+
+          text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=]
+
+          tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=]
+
+          guider (`ClassifierFreeGuidance`) [subfolder=]
+
+      Configs:
+
+          prompt_template_encode (default: <|im_start|>system
+    Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
+    <|im_start|>user
+    {}<|im_end|>
+    <|im_start|>assistant
+    )
+
+          prompt_template_encode_start_idx (default: 34)
+
+          tokenizer_max_length (default: 1024)
+
+      Inputs:
+
+          prompt (`str`, *optional*):
+              The prompt or prompts to guide image generation.
+
+          negative_prompt (`str`, *optional*):
+              The prompt or prompts not to guide the image generation.
+
+          max_sequence_length (`int`, *optional*, defaults to 1024):
+              Maximum sequence length for prompt encoding.
+
+      Outputs:
+
+          prompt_embeds (`Tensor`):
+              The prompt embeddings
+
+          prompt_embeds_mask (`Tensor`):
+              The encoder attention mask
+
+          negative_prompt_embeds (`Tensor`):
+              The negative prompt embeddings
+
+          negative_prompt_embeds_mask (`Tensor`):
+              The negative prompt embeddings mask
+    """
    model_name = "qwenimage"
    block_classes = [QwenImageTextEncoderStep()]
    block_names = ["text_encoder"]
@@ -76,8 +127,54 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks):
 # 2. VAE ENCODER
 # ====================

-
+# auto_docstring
 class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
+    """
+    class QwenImageInpaintVaeEncoderStep
+
+      This step is used for processing image and mask inputs for inpainting tasks. It:
+       - Resizes the image to the target size, based on `height` and `width`.
+       - Processes and updates `image` and `mask_image`.
+       - Creates `image_latents`.
+
+      Components:
+
+          image_mask_processor (`InpaintProcessor`) [subfolder=]
+
+          vae (`AutoencoderKLQwenImage`) [subfolder=]
+
+      Inputs:
+
+          mask_image (`Image`):
+              Mask image for inpainting.
+
+          image (`Image`):
+              Input image for img2img, editing, or conditioning.
+
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+
+          padding_mask_crop (`int`, *optional*):
+              Padding for mask cropping in inpainting.
+
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+      Outputs:
+
+          processed_image (`None`):
+
+          processed_mask_image (`None`):
+
+          mask_overlay_kwargs (`Dict`):
+              The kwargs for the postprocess step to apply the mask overlay
+
+          image_latents (`Tensor`):
+              The latents representing the reference image(s). Single tensor or list depending on input.
+    """
    model_name = "qwenimage"
    block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()]
    block_names = ["preprocess", "encode"]
@@ -92,7 +189,40 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
        )


+# auto_docstring
 class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
+    """
+    class QwenImageImg2ImgVaeEncoderStep
+
+      Vae encoder step that preprocess andencode the image inputs into their latent representations.
+
+      Components:
+
+          image_processor (`VaeImageProcessor`) [subfolder=]
+
+          vae (`AutoencoderKLQwenImage`) [subfolder=]
+
+      Inputs:
+
+          image (`Image`):
+              Input image for img2img, editing, or conditioning.
+
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+      Outputs:
+
+          processed_image (`None`):
+
+          image_latents (`Tensor`):
+              The latents representing the reference image(s). Single tensor or list depending on input.
+    """
    model_name = "qwenimage"

    block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()]
@@ -103,7 +233,6 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
        return "Vae encoder step that preprocess andencode the image inputs into their latent representations."


-# Auto VAE encoder
 class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
    block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]
    block_names = ["inpaint", "img2img"]
@@ -121,7 +250,43 @@ class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):


 # optional controlnet vae encoder
+# auto_docstring
 class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
+    """
+    class QwenImageOptionalControlNetVaeEncoderStep
+
+      Vae encoder step that encode the image inputs into their latent representations.
+      This is an auto pipeline block.
+       - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.
+       - if `control_image` is not provided, step will be skipped.
+
+      Components:
+
+          vae (`AutoencoderKLQwenImage`) [subfolder=]
+
+          controlnet (`QwenImageControlNetModel`) [subfolder=]
+
+          control_image_processor (`VaeImageProcessor`) [subfolder=]
+
+      Inputs:
+
+          control_image (`Image`, *optional*):
+              Control image for ControlNet conditioning.
+
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+      Outputs:
+
+          control_image_latents (`Tensor`):
+              The latents representing the control image
+    """
    block_classes = [QwenImageControlNetVaeEncoderStep]
    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image"]
@@ -142,7 +307,52 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):


 # assemble input steps
+# auto_docstring
 class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
+    """
+    class QwenImageImg2ImgInputStep
+
+      Input step that prepares the inputs for the img2img denoising step. It:
+
+      Components:
+
+          pachifier (`QwenImagePachifier`) [subfolder=]
+
+      Inputs:
+
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+
+          prompt_embeds (`None`):
+
+          prompt_embeds_mask (`None`):
+
+          negative_prompt_embeds (`None`, *optional*):
+
+          negative_prompt_embeds_mask (`None`, *optional*):
+
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+
+          image_latents (`None`, *optional*):
+
+      Outputs:
+
+          batch_size (`int`):
+              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt
+
+          dtype (`dtype`):
+              Data type of model tensor inputs (determined by `prompt_embeds`)
+
+          image_height (`int`):
+              The image height calculated from the image latents dimension
+
+          image_width (`int`):
+              The image width calculated from the image latents dimension
+    """
    model_name = "qwenimage"
    block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])]
    block_names = ["text_inputs", "additional_inputs"]
@@ -154,7 +364,54 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
        " - update height/width based `image_latents`, patchify `image_latents`."


+# auto_docstring
 class QwenImageInpaintInputStep(SequentialPipelineBlocks):
+    """
+    class QwenImageInpaintInputStep
+
+      Input step that prepares the inputs for the inpainting denoising step. It:
+
+      Components:
+
+          pachifier (`QwenImagePachifier`) [subfolder=]
+
+      Inputs:
+
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+
+          prompt_embeds (`None`):
+
+          prompt_embeds_mask (`None`):
+
+          negative_prompt_embeds (`None`, *optional*):
+
+          negative_prompt_embeds_mask (`None`, *optional*):
+
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+
+          image_latents (`None`, *optional*):
+
+          processed_mask_image (`None`, *optional*):
+
+      Outputs:
+
+          batch_size (`int`):
+              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt
+
+          dtype (`dtype`):
+              Data type of model tensor inputs (determined by `prompt_embeds`)
+
+          image_height (`int`):
+              The image height calculated from the image latents dimension
+
+          image_width (`int`):
+              The image width calculated from the image latents dimension
+    """
    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
@@ -172,7 +429,49 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks):


 # assemble prepare latents steps
+# auto_docstring
 class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
+    """
+    class QwenImageInpaintPrepareLatentsStep
+
+      This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:
+       - Add noise to the image latents to create the latents input for the denoiser.
+       - Create the pachified latents `mask` based on the processedmask image.
+
+      Components:
+
+          scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+          pachifier (`QwenImagePachifier`) [subfolder=]
+
+      Inputs:
+
+          latents (`Tensor`):
+              The initial random noised, can be generated in prepare latent step.
+
+          image_latents (`Tensor`):
+              The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.
+
+          timesteps (`Tensor`):
+              The timesteps to use for the denoising process. Can be generated in set_timesteps step.
+
+          processed_mask_image (`Tensor`):
+              The processed mask to use for the inpainting process.
+
+          height (`None`):
+
+          width (`None`):
+
+          dtype (`None`):
+
+      Outputs:
+
+          initial_noise (`Tensor`):
+              The initial random noised used for inpainting denoising.
+
+          mask (`Tensor`):
+              The mask to use for the inpainting process.
+    """
    model_name = "qwenimage"
    block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
    block_names = ["add_noise_to_latents", "create_mask_latents"]
@@ -190,7 +489,66 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):


 # Qwen Image (text2image)
+# auto_docstring
 class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    class QwenImageCoreDenoiseStep
+
+      step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.).
+
+      Components:
+
+          pachifier (`QwenImagePachifier`) [subfolder=]
+
+          scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+          guider (`ClassifierFreeGuidance`) [subfolder=]
+
+          transformer (`QwenImageTransformer2DModel`) [subfolder=]
+
+      Inputs:
+
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+
+          prompt_embeds (`None`):
+
+          prompt_embeds_mask (`None`):
+
+          negative_prompt_embeds (`None`, *optional*):
+
+          negative_prompt_embeds_mask (`None`, *optional*):
+
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+
+          **denoiser_input_fields (`Tensor`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+      Outputs:
+
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
@@ -212,10 +570,81 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
    @property
    def description(self):
        return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)."
-
+    
+    @property
+    def outputs(self):
+        return [
+            OutputParam.latents(),
+        ]

 # Qwen Image (inpainting)
+# auto_docstring
 class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    class QwenImageInpaintCoreDenoiseStep
+
+      Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task.
+
+      Components:
+
+          pachifier (`QwenImagePachifier`) [subfolder=]
+
+          scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+          guider (`ClassifierFreeGuidance`) [subfolder=]
+
+          transformer (`QwenImageTransformer2DModel`) [subfolder=]
+
+      Inputs:
+
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+
+          prompt_embeds (`None`):
+
+          prompt_embeds_mask (`None`):
+
+          negative_prompt_embeds (`None`, *optional*):
+
+          negative_prompt_embeds_mask (`None`, *optional*):
+
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+
+          image_latents (`None`, *optional*):
+
+          processed_mask_image (`None`, *optional*):
+
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+
+          **denoiser_input_fields (`Tensor`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+      Outputs:
+
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageInpaintInputStep(),
@@ -240,9 +669,78 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.latents(),
+        ]

 # Qwen Image (image2image)
+# auto_docstring
 class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    class QwenImageImg2ImgCoreDenoiseStep
+
+      Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task.
+
+      Components:
+
+          pachifier (`QwenImagePachifier`) [subfolder=]
+
+          scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+          guider (`ClassifierFreeGuidance`) [subfolder=]
+
+          transformer (`QwenImageTransformer2DModel`) [subfolder=]
+
+      Inputs:
+
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+
+          prompt_embeds (`None`):
+
+          prompt_embeds_mask (`None`):
+
+          negative_prompt_embeds (`None`, *optional*):
+
+          negative_prompt_embeds_mask (`None`, *optional*):
+
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+
+          image_latents (`None`, *optional*):
+
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+
+          **denoiser_input_fields (`Tensor`, *optional*):
+              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+      Outputs:
+
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageImg2ImgInputStep(),
@@ -267,9 +765,87 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.latents(),
+        ]

 # Qwen Image (text2image) with controlnet
+# auto_docstring
 class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    class QwenImageControlNetCoreDenoiseStep
+
+      step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.).
+
+      Components:
+
+          pachifier (`QwenImagePachifier`) [subfolder=]
+
+          scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+          controlnet (`QwenImageControlNetModel`) [subfolder=]
+
+          guider (`ClassifierFreeGuidance`) [subfolder=]
+
+          transformer (`QwenImageTransformer2DModel`) [subfolder=]
+
+      Inputs:
+
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+
+          prompt_embeds (`None`):
+
+          prompt_embeds_mask (`None`):
+
+          negative_prompt_embeds (`None`, *optional*):
+
+          negative_prompt_embeds_mask (`None`, *optional*):
+
+          control_image_latents (`None`):
+
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+
+          control_guidance_start (`float`, *optional*, defaults to 0.0):
+              When to start applying ControlNet.
+
+          control_guidance_end (`float`, *optional*, defaults to 1.0):
+              When to stop applying ControlNet.
+
+          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+              Scale for ControlNet conditioning.
+
+          **denoiser_input_fields (`None`, *optional*):
+              All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds,
+              txt_seq_lens/negative_txt_seq_lens.
+
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+
+      Outputs:
+
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageTextInputsStep(),
@@ -295,10 +871,95 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
    @property
    def description(self):
        return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)."
-
+ 
+    @property
+    def outputs(self):
+        return [
+            OutputParam.latents(),
+        ]

 # Qwen Image (inpainting) with controlnet
+# auto_docstring
 class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    class QwenImageControlNetInpaintCoreDenoiseStep
+
+      Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task.
+
+      Components:
+
+          pachifier (`QwenImagePachifier`) [subfolder=]
+
+          scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+          controlnet (`QwenImageControlNetModel`) [subfolder=]
+
+          guider (`ClassifierFreeGuidance`) [subfolder=]
+
+          transformer (`QwenImageTransformer2DModel`) [subfolder=]
+
+      Inputs:
+
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+
+          prompt_embeds (`None`):
+
+          prompt_embeds_mask (`None`):
+
+          negative_prompt_embeds (`None`, *optional*):
+
+          negative_prompt_embeds_mask (`None`, *optional*):
+
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+
+          image_latents (`None`, *optional*):
+
+          processed_mask_image (`None`, *optional*):
+
+          control_image_latents (`None`):
+
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+
+          control_guidance_start (`float`, *optional*, defaults to 0.0):
+              When to start applying ControlNet.
+
+          control_guidance_end (`float`, *optional*, defaults to 1.0):
+              When to stop applying ControlNet.
+
+          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+              Scale for ControlNet conditioning.
+
+          **denoiser_input_fields (`None`, *optional*):
+              All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds,
+              txt_seq_lens/negative_txt_seq_lens.
+
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+
+      Outputs:
+
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageInpaintInputStep(),
@@ -327,9 +988,93 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
    def description(self):
        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."

+    @property
+    def outputs(self):
+        return [
+            OutputParam.latents(),
+        ]
+

 # Qwen Image (image2image) with controlnet
+# auto_docstring
 class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    class QwenImageControlNetImg2ImgCoreDenoiseStep
+
+      Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task.
+
+      Components:
+
+          pachifier (`QwenImagePachifier`) [subfolder=]
+
+          scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+          controlnet (`QwenImageControlNetModel`) [subfolder=]
+
+          guider (`ClassifierFreeGuidance`) [subfolder=]
+
+          transformer (`QwenImageTransformer2DModel`) [subfolder=]
+
+      Inputs:
+
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+
+          prompt_embeds (`None`):
+
+          prompt_embeds_mask (`None`):
+
+          negative_prompt_embeds (`None`, *optional*):
+
+          negative_prompt_embeds_mask (`None`, *optional*):
+
+          height (`int`, *optional*):
+              The height in pixels of the generated image.
+
+          width (`int`, *optional*):
+              The width in pixels of the generated image.
+
+          image_latents (`None`, *optional*):
+
+          control_image_latents (`None`):
+
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+
+          num_inference_steps (`int`, *optional*, defaults to 50):
+              The number of denoising steps.
+
+          sigmas (`List`, *optional*):
+              Custom sigmas for the denoising process.
+
+          strength (`float`, *optional*, defaults to 0.9):
+              Strength for img2img/inpainting.
+
+          control_guidance_start (`float`, *optional*, defaults to 0.0):
+              When to start applying ControlNet.
+
+          control_guidance_end (`float`, *optional*, defaults to 1.0):
+              When to stop applying ControlNet.
+
+          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+              Scale for ControlNet conditioning.
+
+          **denoiser_input_fields (`None`, *optional*):
+              All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds,
+              txt_seq_lens/negative_txt_seq_lens.
+
+          attention_kwargs (`Dict`, *optional*):
+              Additional kwargs for attention processors.
+
+      Outputs:
+
+          latents (`Tensor`):
+              Denoised latents.
+    """
+
    model_name = "qwenimage"
    block_classes = [
        QwenImageImg2ImgInputStep(),
@@ -357,7 +1102,12 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
    @property
    def description(self):
        return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."
-
+    
+    @property
+    def outputs(self):
+        return [
+            OutputParam.latents(),
+        ]

 # Auto denoise step for QwenImage
 class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
@@ -426,7 +1176,32 @@ class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):


 # standard decode step works for most tasks except for inpaint
+# auto_docstring
 class QwenImageDecodeStep(SequentialPipelineBlocks):
+    """
+    class QwenImageDecodeStep
+
+      Decode step that decodes the latents to images and postprocess the generated image.
+
+      Components:
+
+          vae (`AutoencoderKLQwenImage`) [subfolder=]
+
+          image_processor (`VaeImageProcessor`) [subfolder=]
+
+      Inputs:
+
+          latents (`Tensor`):
+              The latents to decode, can be generated in the denoise step
+
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt''.
+
+      Outputs:
+
+          images (`List`):
+              Generated images.
+    """
    model_name = "qwenimage"
    block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]
@@ -437,7 +1212,34 @@ class QwenImageDecodeStep(SequentialPipelineBlocks):


 # Inpaint decode step
+# auto_docstring
 class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
+    """
+    class QwenImageInpaintDecodeStep
+
+      Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image.
+
+      Components:
+
+          vae (`AutoencoderKLQwenImage`) [subfolder=]
+
+          image_mask_processor (`InpaintProcessor`) [subfolder=]
+
+      Inputs:
+
+          latents (`Tensor`):
+              The latents to decode, can be generated in the denoise step
+
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt''.
+
+          mask_overlay_kwargs (`None`, *optional*):
+
+      Outputs:
+
+          images (`List`):
+              Generated images.
+    """
    model_name = "qwenimage"
    block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
    block_names = ["decode", "postprocess"]
--- a/utils/modular_auto_docstring.py
+++ b/utils/modular_auto_docstring.py
@@ -0,0 +1,296 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Auto Docstring Generator for Modular Pipeline Blocks
+
+This script scans Python files for classes that have `# auto_docstring` comment above them
+and inserts/updates the docstring from the class's `doc` property.
+
+Run from the root of the repo:
+    python utils/modular_auto_docstring.py [path] [--fix_and_overwrite]
+
+Examples:
+    # Check for auto_docstring markers (will error if found without proper docstring)
+    python utils/modular_auto_docstring.py
+
+    # Check specific directory
+    python utils/modular_auto_docstring.py src/diffusers/modular_pipelines/
+
+    # Fix and overwrite the docstrings
+    python utils/modular_auto_docstring.py --fix_and_overwrite
+
+Usage in code:
+    # auto_docstring
+    class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
+        # docstring will be automatically inserted here
+        
+        @property
+        def doc(self):
+            return "Your docstring content..."
+"""
+
+import argparse
+import ast
+import glob
+import importlib
+import os
+import re
+import sys
+
+
+# Paths are relative to the repo root, so this script must be run from there.
+DIFFUSERS_PATH = "src/diffusers"  # default location scanned when no path argument is given
+REPO_PATH = "."  # joined with "src" to build the sys.path entry used for local imports
+
+# Matches a line holding only the marker comment; the space after `#` is optional ("#auto_docstring" matches too).
+AUTO_DOCSTRING_PATTERN = re.compile(r"^\s*#\s*auto_docstring\s*$")
+
+
+def setup_diffusers_import():
+    """Put the repo-local `src` directory first on `sys.path` (no-op when it is already listed)."""
+    local_src = os.path.join(REPO_PATH, "src")
+    if local_src not in sys.path:
+        sys.path.insert(0, local_src)
+
+
+def get_module_from_filepath(filepath: str) -> str:
+    """
+    Translate a source file path into its dotted import name.
+
+    The path is normalized, a leading `src/` directory and a trailing `.py` are dropped, and the
+    remaining path separators become dots.
+    """
+    src_prefix = "src" + os.sep
+    path = os.path.normpath(filepath)
+    path = path[len(src_prefix) :] if path.startswith(src_prefix) else path
+    path = path[: -len(".py")] if path.endswith(".py") else path
+    return path.replace(os.sep, ".")
+
+
+def load_module(filepath: str):
+    """Import the module for `filepath`; return `None` (with a printed warning) if that fails."""
+    setup_diffusers_import()
+    dotted_name = get_module_from_filepath(filepath)
+
+    # Best-effort: whatever the import raises is reported and mapped to `None`.
+    try:
+        return importlib.import_module(dotted_name)
+    except Exception as err:
+        print(f"Warning: Could not import module {dotted_name}: {err}")
+        return None
+
+
+def get_doc_from_class(module, class_name: str) -> str:
+    """
+    Instantiate `class_name` from `module` (no arguments) and return its `doc` attribute.
+
+    Returns `None` when `module` is `None`, the class is missing, the instance has no `doc`, or
+    instantiation / attribute access raises (a warning is printed in that last case).
+    """
+    target_cls = None if module is None else getattr(module, class_name, None)
+    if target_cls is None:
+        return None
+
+    try:
+        obj = target_cls()
+        return obj.doc if hasattr(obj, "doc") else None
+    except Exception as e:
+        print(f"Warning: Could not instantiate {class_name}: {e}")
+        return None
+
+
+def find_auto_docstring_classes(filepath: str) -> list:
+    """
+    Collect the classes in `filepath` that are preceded by a `# auto_docstring` marker comment.
+
+    Each entry is a tuple `(class_name, class_line, has_docstring, docstring_end_line)` with 1-indexed
+    line numbers taken from the AST; `docstring_end_line` equals `class_line` when the class has no
+    docstring. Blank lines and other comments may sit between the marker and the `class` statement.
+    A file that fails to parse yields an empty list (the syntax error is printed).
+    """
+    with open(filepath, "r", encoding="utf-8", newline="\n") as f:
+        source_lines = f.readlines()
+
+    try:
+        tree = ast.parse("".join(source_lines))
+    except SyntaxError as e:
+        print(f"Syntax error in {filepath}: {e}")
+        return []
+
+    # class name -> (class line, has docstring, last docstring line); a repeated name keeps the last visit
+    info_by_name = {}
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.ClassDef):
+            continue
+        entry = (node.lineno, False, node.lineno)
+        head = node.body[0] if node.body else None
+        if isinstance(head, ast.Expr):
+            value = head.value
+            if isinstance(value, ast.Constant) and isinstance(value.value, str):
+                entry = (node.lineno, True, head.end_lineno or head.lineno)
+        info_by_name[node.name] = entry
+
+    found = []
+    total = len(source_lines)
+    for idx, text in enumerate(source_lines):
+        if not AUTO_DOCSTRING_PATTERN.match(text):
+            continue
+
+        # Advance to the first line after the marker that is neither blank nor a comment.
+        cursor = idx + 1
+        while cursor < total:
+            stripped = source_lines[cursor].strip()
+            if stripped and not stripped.startswith("#"):
+                break
+            cursor += 1
+        if cursor >= total:
+            continue
+
+        candidate = source_lines[cursor].strip()
+        if not candidate.startswith("class "):
+            continue
+        name_match = re.match(r"class\s+(\w+)", candidate)
+        if name_match is None:
+            continue
+
+        class_name = name_match.group(1)
+        if class_name in info_by_name:
+            class_line, has_docstring, docstring_end_line = info_by_name[class_name]
+            found.append((class_name, class_line, has_docstring, docstring_end_line))
+
+    return found
+
+
+def format_docstring(doc: str, indent: str = "    ") -> str:
+    """
+    Render `doc` as triple-quoted docstring source text, each non-blank line prefixed with `indent`.
+
+    Surrounding whitespace of `doc` is stripped first. A single remaining line gives a one-line
+    docstring; otherwise the quotes go on their own lines and blank lines are emitted without indent.
+    The result always ends with a newline.
+    """
+    body = doc.strip().split("\n")
+    quotes = f'{indent}"""'
+    if len(body) == 1:
+        return quotes + body[0] + '"""\n'
+
+    rendered = "".join(f"{indent}{row}\n" if row.strip() else "\n" for row in body)
+    return f"{quotes}\n{rendered}{quotes}\n"
+
+
+def process_file(filepath: str, overwrite: bool = False) -> list:
+    """
+    Check, or regenerate, the docstrings of the `# auto_docstring` marked classes in `filepath`.
+
+    Args:
+        filepath: Python source file to process.
+        overwrite: If `False` (check mode) the file is left untouched. If `True`, the module is
+            imported and each marked class gets its docstring inserted or replaced from its `doc`.
+
+    Returns:
+        A list of `(filepath, class_name, class_line)` tuples: in check mode the marked classes that
+        have no docstring yet, in overwrite mode every marked class that was found.
+    """
+    classes_to_update = find_auto_docstring_classes(filepath)
+
+    if not classes_to_update:
+        return []
+
+    if not overwrite:
+        # Only a missing docstring counts as a failure. Reporting every marker would make the check
+        # fail forever, since the marker stays in place after the docstring has been generated.
+        return [(filepath, name, line) for name, line, has_doc, _ in classes_to_update if not has_doc]
+
+    # Load the module to get doc properties
+    module = load_module(filepath)
+
+    with open(filepath, "r", encoding="utf-8", newline="\n") as f:
+        lines = f.readlines()
+
+    # Walk bottom-up so edits never shift the line numbers of classes still to be processed.
+    updated = False
+    for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update):
+        doc = get_doc_from_class(module, class_name)
+        if doc is None:
+            print(f"Warning: Could not get doc for {class_name} in {filepath}")
+            continue
+
+        # `class_line` is 1-indexed, so `lines[:class_line]` ends with the `class` line itself.
+        # With an existing docstring, resume after its last line (dropping it); otherwise resume
+        # right after the `class` line. NOTE: assumes the class header fits on a single line.
+        resume_at = docstring_end_line if has_docstring else class_line
+        lines = lines[:class_line] + [format_docstring(doc, "    ")] + lines[resume_at:]
+
+        updated = True
+        print(f"Updated docstring for {class_name} in {filepath}")
+
+    if updated:
+        with open(filepath, "w", encoding="utf-8", newline="\n") as f:
+            f.writelines(lines)
+
+    return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update]
+
+
+def check_auto_docstrings(path: str = None, overwrite: bool = False):
+    """
+    Run `process_file` on `path` (a single file, or every `*.py` file found recursively under a directory).
+
+    `path` defaults to `DIFFUSERS_PATH`. Raises `ValueError` listing the reported classes when
+    `overwrite` is `False` and any were reported; otherwise prints a short summary.
+    """
+    target = DIFFUSERS_PATH if path is None else path
+    if os.path.isfile(target):
+        all_files = [target]
+    else:
+        all_files = glob.glob(os.path.join(target, "**/*.py"), recursive=True)
+
+    all_markers = [marker for filepath in all_files for marker in process_file(filepath, overwrite)]
+
+    if not all_markers:
+        print("No # auto_docstring markers found.")
+        return
+
+    if overwrite:
+        print(f"\nUpdated {len(all_markers)} docstring(s).")
+        return
+
+    # Check mode with reported classes: fail with a listing and the command that fixes them.
+    message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers])
+    raise ValueError(
+        f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n"
+        f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them."
+    )
+
+
+if __name__ == "__main__":
+    # Command line entry point: check mode by default, rewrite mode with `--fix_and_overwrite`.
+    cli = argparse.ArgumentParser(
+        description="Check and fix # auto_docstring markers in modular pipeline blocks",
+    )
+
+    # Optional positional argument; `None` lets `check_auto_docstrings` pick its default path.
+    cli.add_argument(
+        "path", nargs="?", default=None, help="File or directory to process (default: src/diffusers)"
+    )
+    cli.add_argument(
+        "--fix_and_overwrite",
+        action="store_true",
+        help="Whether to fix the docstrings by inserting them from doc property.",
+    )
+
+    options = cli.parse_args()
+
+    check_auto_docstrings(options.path, options.fix_and_overwrite)