From ff09bf1a631e38683205217e8dba4961de090319 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 11:55:03 +0100 Subject: [PATCH] add modular_auto_docstring! --- .../qwenimage/modular_blocks_qwenimage.py | 814 +++++++++++++++++- utils/modular_auto_docstring.py | 296 +++++++ 2 files changed, 1104 insertions(+), 6 deletions(-) create mode 100644 utils/modular_auto_docstring.py diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index d6117a12a5..19feffe77e 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -58,8 +58,59 @@ logger = logging.get_logger(__name__) # 1. TEXT ENCODER # ==================== - +#auto_docstring class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): + """ + class QwenImageAutoTextEncoderStep + + Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. + + Components: + + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + + tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + + guider (`ClassifierFreeGuidance`) [subfolder=] + + Configs: + + prompt_template_encode (default: <|im_start|>system + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) + + prompt_template_encode_start_idx (default: 34) + + tokenizer_max_length (default: 1024) + + Inputs: + + prompt (`str`, *optional*): + The prompt or prompts to guide image generation. + + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + + max_sequence_length (`int`, *optional*, defaults to 1024): + Maximum sequence length for prompt encoding. + + Outputs: + + prompt_embeds (`Tensor`): + The prompt embeddings + + prompt_embeds_mask (`Tensor`): + The encoder attention mask + + negative_prompt_embeds (`Tensor`): + The negative prompt embeddings + + negative_prompt_embeds_mask (`Tensor`): + The negative prompt embeddings mask + """ model_name = "qwenimage" block_classes = [QwenImageTextEncoderStep()] block_names = ["text_encoder"] @@ -76,8 +127,54 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): # 2. VAE ENCODER # ==================== - +#auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): + """ + class QwenImageInpaintVaeEncoderStep + + This step is used for processing image and mask inputs for inpainting tasks. It: + - Resizes the image to the target size, based on `height` and `width`. + - Processes and updates `image` and `mask_image`. + - Creates `image_latents`. + + Components: + + image_mask_processor (`InpaintProcessor`) [subfolder=] + + vae (`AutoencoderKLQwenImage`) [subfolder=] + + Inputs: + + mask_image (`Image`): + Mask image for inpainting. + + image (`Image`): + Input image for img2img, editing, or conditioning. + + height (`int`, *optional*): + The height in pixels of the generated image. + + width (`int`, *optional*): + The width in pixels of the generated image. + + padding_mask_crop (`int`, *optional*): + Padding for mask cropping in inpainting. + + generator (`Generator`, *optional*): + Torch generator for deterministic generation. 
+
+ Outputs:
+
+ processed_image (`None`):
+
+ processed_mask_image (`None`):
+
+ mask_overlay_kwargs (`Dict`):
+ The kwargs for the postprocess step to apply the mask overlay
+
+ image_latents (`Tensor`):
+ The latents representing the reference image(s). Single tensor or list depending on input.
+ """
 model_name = "qwenimage"
 block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()]
 block_names = ["preprocess", "encode"]
@@ -92,7 +189,40 @@
 )

+#auto_docstring
 class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
+ """
+ class QwenImageImg2ImgVaeEncoderStep
+
+ Vae encoder step that preprocesses and encodes the image inputs into their latent representations.
+
+ Components:
+
+ image_processor (`VaeImageProcessor`) [subfolder=]
+
+ vae (`AutoencoderKLQwenImage`) [subfolder=]
+
+ Inputs:
+
+ image (`Image`):
+ Input image for img2img, editing, or conditioning.
+
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+
+ generator (`Generator`, *optional*):
+ Torch generator for deterministic generation.
+
+ Outputs:
+
+ processed_image (`None`):
+
+ image_latents (`Tensor`):
+ The latents representing the reference image(s). Single tensor or list depending on input.
+ """
 model_name = "qwenimage"
 block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()]
@@ -103,7 +233,6 @@
 return "Vae encoder step that preprocesses and encodes the image inputs into their latent representations."

-# Auto VAE encoder
 class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
 block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]
 block_names = ["inpaint", "img2img"]
@@ -121,7 +250,43 @@
 # optional controlnet vae encoder
+#auto_docstring
 class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
+ """
+ class QwenImageOptionalControlNetVaeEncoderStep
+
+ Vae encoder step that encodes the image inputs into their latent representations.
+ This is an auto pipeline block.
+ - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.
+ - If `control_image` is not provided, this step will be skipped.
+
+ Components:
+
+ vae (`AutoencoderKLQwenImage`) [subfolder=]
+
+ controlnet (`QwenImageControlNetModel`) [subfolder=]
+
+ control_image_processor (`VaeImageProcessor`) [subfolder=]
+
+ Inputs:
+
+ control_image (`Image`, *optional*):
+ Control image for ControlNet conditioning.
+
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+
+ generator (`Generator`, *optional*):
+ Torch generator for deterministic generation.
+
+ Outputs:
+
+ control_image_latents (`Tensor`):
+ The latents representing the control image
+ """
 block_classes = [QwenImageControlNetVaeEncoderStep]
 block_names = ["controlnet"]
 block_trigger_inputs = ["control_image"]
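For readers skimming this patch, here is a minimal sketch of the trigger-based dispatch that the `AutoPipelineBlocks` subclasses above rely on. The helper below is illustrative only, and the trigger names assumed for `QwenImageAutoVaeEncoderStep` are inferred from its sub-blocks' inputs, since its `block_trigger_inputs` is elided from this hunk:

def select_block(block_names, block_trigger_inputs, call_kwargs):
    # Walk the trigger inputs in order; the first trigger present in the call
    # selects the corresponding sub-block. A None trigger marks a default block.
    for name, trigger in zip(block_names, block_trigger_inputs):
        if trigger is None or call_kwargs.get(trigger) is not None:
            return name
    return None  # no trigger matched: the step is skipped

# Assumed triggers: "mask_image" -> inpaint branch, "image" -> img2img branch.
assert select_block(["inpaint", "img2img"], ["mask_image", "image"], {"mask_image": "..."}) == "inpaint"
assert select_block(["inpaint", "img2img"], ["mask_image", "image"], {"image": "..."}) == "img2img"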
@@ -142,7 +307,52 @@
 # assemble input steps
+#auto_docstring
 class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
+ """
+ class QwenImageImg2ImgInputStep
+
+ Input step that prepares the inputs for the img2img denoising step. It:
+ - update height/width based on `image_latents`, patchify `image_latents`.
+
+ Components:
+
+ pachifier (`QwenImagePachifier`) [subfolder=]
+
+ Inputs:
+
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+
+ prompt_embeds (`None`):
+
+ prompt_embeds_mask (`None`):
+
+ negative_prompt_embeds (`None`, *optional*):
+
+ negative_prompt_embeds_mask (`None`, *optional*):
+
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+
+ image_latents (`None`, *optional*):
+
+ Outputs:
+
+ batch_size (`int`):
+ Number of prompts; the final batch size of model inputs should be batch_size * num_images_per_prompt
+
+ dtype (`dtype`):
+ Data type of model tensor inputs (determined by `prompt_embeds`)
+
+ image_height (`int`):
+ The image height calculated from the image latents dimension
+
+ image_width (`int`):
+ The image width calculated from the image latents dimension
+ """
 model_name = "qwenimage"
 block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])]
 block_names = ["text_inputs", "additional_inputs"]
@@ -154,7 +364,54 @@
 " - update height/width based on `image_latents`, patchify `image_latents`."

+#auto_docstring
 class QwenImageInpaintInputStep(SequentialPipelineBlocks):
+ """
+ class QwenImageInpaintInputStep
+
+ Input step that prepares the inputs for the inpainting denoising step.
+
+ Components:
+
+ pachifier (`QwenImagePachifier`) [subfolder=]
+
+ Inputs:
+
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+
+ prompt_embeds (`None`):
+
+ prompt_embeds_mask (`None`):
+
+ negative_prompt_embeds (`None`, *optional*):
+
+ negative_prompt_embeds_mask (`None`, *optional*):
+
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+
+ image_latents (`None`, *optional*):
+
+ processed_mask_image (`None`, *optional*):
+
+ Outputs:
+
+ batch_size (`int`):
+ Number of prompts; the final batch size of model inputs should be batch_size * num_images_per_prompt
+
+ dtype (`dtype`):
+ Data type of model tensor inputs (determined by `prompt_embeds`)
+
+ image_height (`int`):
+ The image height calculated from the image latents dimension
+
+ image_width (`int`):
+ The image width calculated from the image latents dimension
+ """
 model_name = "qwenimage"
 block_classes = [
 QwenImageTextInputsStep(),
@@ -172,7 +429,49 @@
 # assemble prepare latents steps

+#auto_docstring
 class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
+ """
+ class QwenImageInpaintPrepareLatentsStep
+
+ This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:
+ - Adds noise to the image latents to create the latents input for the denoiser.
+ - Creates the patchified latents `mask` based on the processed mask image.
+
+ Components:
+
+ scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+ pachifier (`QwenImagePachifier`) [subfolder=]
+
+ Inputs:
+
+ latents (`Tensor`):
+ The initial random noise; can be generated in the prepare latents step.
+
+ image_latents (`Tensor`):
+ The image latents to use for the denoising process. Can be generated in the vae encoder step and packed in the input step.
+
+ timesteps (`Tensor`):
+ The timesteps to use for the denoising process. Can be generated in the set_timesteps step.
+
+ processed_mask_image (`Tensor`):
+ The processed mask to use for the inpainting process.
+
+ height (`None`):
+
+ width (`None`):
+
+ dtype (`None`):
+
+ Outputs:
+
+ initial_noise (`Tensor`):
+ The initial random noise used for inpainting denoising.
+
+ mask (`Tensor`):
+ The mask to use for the inpainting process.
+ """
 model_name = "qwenimage"
 block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()]
 block_names = ["add_noise_to_latents", "create_mask_latents"]
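The two bullet points in `QwenImageInpaintPrepareLatentsStep` above boil down to a small amount of scheduler math. A hedged sketch, assuming a flow-match scheduler (`FlowMatchEulerDiscreteScheduler.scale_noise` is an existing diffusers API; the wrapper itself is illustrative, not the block's actual code):

import torch

def add_noise_with_strength(scheduler, image_latents, timesteps, generator=None):
    # Sample the `initial_noise` that the step also returns as an output.
    noise = torch.randn(image_latents.shape, generator=generator, dtype=image_latents.dtype)
    # Re-noise the clean image latents to the first timestep of the
    # (strength-truncated) schedule; this becomes the denoiser's starting `latents`.
    latents = scheduler.scale_noise(image_latents, timesteps[:1], noise)
    return latents, noise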
@@ -190,7 +489,66 @@
 # Qwen Image (text2image)

+#auto_docstring
 class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ class QwenImageCoreDenoiseStep
+
+ Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing the inputs (timesteps, latents, rope inputs etc.).
+
+ Components:
+
+ pachifier (`QwenImagePachifier`) [subfolder=]
+
+ scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+ guider (`ClassifierFreeGuidance`) [subfolder=]
+
+ transformer (`QwenImageTransformer2DModel`) [subfolder=]
+
+ Inputs:
+
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+
+ prompt_embeds (`None`):
+
+ prompt_embeds_mask (`None`):
+
+ negative_prompt_embeds (`None`, *optional*):
+
+ negative_prompt_embeds_mask (`None`, *optional*):
+
+ latents (`Tensor`, *optional*):
+ Pre-generated noisy latents for image generation.
+
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+
+ generator (`Generator`, *optional*):
+ Torch generator for deterministic generation.
+
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps.
+
+ sigmas (`List`, *optional*):
+ Custom sigmas for the denoising process.
+
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+
+ **denoiser_input_fields (`Tensor`, *optional*):
+ conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+ Outputs:
+
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
 model_name = "qwenimage"
 block_classes = [
 QwenImageTextInputsStep(),
@@ -212,10 +570,81 @@
 @property
 def description(self):
 return "Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing the inputs (timesteps, latents, rope inputs etc.)."
-
+
+ @property
+ def outputs(self):
+ return [
+ OutputParam.latents(),
+ ]
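A hedged usage sketch of running this core denoise block on its own via the modular-pipelines API (`init_pipeline`/`load_default_components` follow the diffusers modular docs; the repo id, dtype, and variable wiring here are placeholder assumptions):

import torch

blocks = QwenImageCoreDenoiseStep()
pipe = blocks.init_pipeline("Qwen/Qwen-Image")  # placeholder repo id
pipe.load_default_components(torch_dtype=torch.bfloat16)
# Per the inputs above, the core step consumes embeddings, not raw prompts;
# prompt_embeds / prompt_embeds_mask would come from the text encoder step.
latents = pipe(
    prompt_embeds=prompt_embeds,
    prompt_embeds_mask=prompt_embeds_mask,
    num_inference_steps=28,
    output="latents",  # the block's single declared output; decoding is a separate step
)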
 # Qwen Image (inpainting)
+#auto_docstring
 class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ class QwenImageInpaintCoreDenoiseStep
+
+ Before-denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step of the inpaint task.
+
+ Components:
+
+ pachifier (`QwenImagePachifier`) [subfolder=]
+
+ scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+ guider (`ClassifierFreeGuidance`) [subfolder=]
+
+ transformer (`QwenImageTransformer2DModel`) [subfolder=]
+
+ Inputs:
+
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+
+ prompt_embeds (`None`):
+
+ prompt_embeds_mask (`None`):
+
+ negative_prompt_embeds (`None`, *optional*):
+
+ negative_prompt_embeds_mask (`None`, *optional*):
+
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+
+ image_latents (`None`, *optional*):
+
+ processed_mask_image (`None`, *optional*):
+
+ latents (`Tensor`, *optional*):
+ Pre-generated noisy latents for image generation.
+
+ generator (`Generator`, *optional*):
+ Torch generator for deterministic generation.
+
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps.
+
+ sigmas (`List`, *optional*):
+ Custom sigmas for the denoising process.
+
+ strength (`float`, *optional*, defaults to 0.9):
+ Strength for img2img/inpainting.
+
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+
+ **denoiser_input_fields (`Tensor`, *optional*):
+ conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+ Outputs:
+
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
 model_name = "qwenimage"
 block_classes = [
 QwenImageInpaintInputStep(),
@@ -240,9 +669,78 @@
 def description(self):
 return "Before-denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step of the inpaint task."

+ @property
+ def outputs(self):
+ return [
+ OutputParam.latents(),
+ ]
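For context on the `strength` input above: a hedged sketch of the conventional strength-to-timesteps mapping used by img2img/inpaint pipelines in diffusers (the exact helper in these blocks may differ):

def get_timesteps(timesteps, num_inference_steps, strength):
    # strength=0.9 keeps the last 90% of the schedule: more noise is added to
    # the image latents, so more of the input image gets repainted.
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return timesteps[t_start:], num_inference_steps - t_start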
 # Qwen Image (image2image)
+#auto_docstring
 class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ class QwenImageImg2ImgCoreDenoiseStep
+
+ Before-denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step of the img2img task.
+
+ Components:
+
+ pachifier (`QwenImagePachifier`) [subfolder=]
+
+ scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+ guider (`ClassifierFreeGuidance`) [subfolder=]
+
+ transformer (`QwenImageTransformer2DModel`) [subfolder=]
+
+ Inputs:
+
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+
+ prompt_embeds (`None`):
+
+ prompt_embeds_mask (`None`):
+
+ negative_prompt_embeds (`None`, *optional*):
+
+ negative_prompt_embeds_mask (`None`, *optional*):
+
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+
+ image_latents (`None`, *optional*):
+
+ latents (`Tensor`, *optional*):
+ Pre-generated noisy latents for image generation.
+
+ generator (`Generator`, *optional*):
+ Torch generator for deterministic generation.
+
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps.
+
+ sigmas (`List`, *optional*):
+ Custom sigmas for the denoising process.
+
+ strength (`float`, *optional*, defaults to 0.9):
+ Strength for img2img/inpainting.
+
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+
+ **denoiser_input_fields (`Tensor`, *optional*):
+ conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+
+ Outputs:
+
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
 model_name = "qwenimage"
 block_classes = [
 QwenImageImg2ImgInputStep(),
@@ -267,9 +765,87 @@
 def description(self):
 return "Before-denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step of the img2img task."

+ @property
+ def outputs(self):
+ return [
+ OutputParam.latents(),
+ ]

 # Qwen Image (text2image) with controlnet
+#auto_docstring
 class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ class QwenImageControlNetCoreDenoiseStep
+
+ Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing the inputs (timesteps, latents, rope inputs etc.).
+
+ Components:
+
+ pachifier (`QwenImagePachifier`) [subfolder=]
+
+ scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+ controlnet (`QwenImageControlNetModel`) [subfolder=]
+
+ guider (`ClassifierFreeGuidance`) [subfolder=]
+
+ transformer (`QwenImageTransformer2DModel`) [subfolder=]
+
+ Inputs:
+
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+
+ prompt_embeds (`None`):
+
+ prompt_embeds_mask (`None`):
+
+ negative_prompt_embeds (`None`, *optional*):
+
+ negative_prompt_embeds_mask (`None`, *optional*):
+
+ control_image_latents (`None`):
+
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+
+ latents (`Tensor`, *optional*):
+ Pre-generated noisy latents for image generation.
+
+ generator (`Generator`, *optional*):
+ Torch generator for deterministic generation.
+
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps.
+
+ sigmas (`List`, *optional*):
+ Custom sigmas for the denoising process.
+
+ control_guidance_start (`float`, *optional*, defaults to 0.0):
+ When to start applying ControlNet.
+
+ control_guidance_end (`float`, *optional*, defaults to 1.0):
+ When to stop applying ControlNet.
+
+ controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+ Scale for ControlNet conditioning.
+
+ **denoiser_input_fields (`None`, *optional*):
+ All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds,
+ txt_seq_lens/negative_txt_seq_lens.
+
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+
+ Outputs:
+
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
 model_name = "qwenimage"
 block_classes = [
 QwenImageTextInputsStep(),
@@ -295,10 +871,95 @@
 @property
 def description(self):
 return "Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing the inputs (timesteps, latents, rope inputs etc.)."
-
+
+ @property
+ def outputs(self):
+ return [
+ OutputParam.latents(),
+ ]
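For the `control_guidance_start`/`control_guidance_end` inputs above, a hedged sketch of the standard diffusers gating pattern (illustrative; not lifted from these blocks):

def controlnet_keep(num_steps, start=0.0, end=1.0, scale=1.0):
    # Per-step multiplier for the ControlNet residuals: `scale` while the
    # current step falls inside [start, end] of the schedule, 0.0 outside.
    scales = []
    for i in range(num_steps):
        inside = not (i / num_steps < start or (i + 1) / num_steps > end)
        scales.append(scale if inside else 0.0)
    return scales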
 # Qwen Image (inpainting) with controlnet
+#auto_docstring
 class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ class QwenImageControlNetInpaintCoreDenoiseStep
+
+ Before-denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step of the inpaint task.
+
+ Components:
+
+ pachifier (`QwenImagePachifier`) [subfolder=]
+
+ scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+ controlnet (`QwenImageControlNetModel`) [subfolder=]
+
+ guider (`ClassifierFreeGuidance`) [subfolder=]
+
+ transformer (`QwenImageTransformer2DModel`) [subfolder=]
+
+ Inputs:
+
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+
+ prompt_embeds (`None`):
+
+ prompt_embeds_mask (`None`):
+
+ negative_prompt_embeds (`None`, *optional*):
+
+ negative_prompt_embeds_mask (`None`, *optional*):
+
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+
+ image_latents (`None`, *optional*):
+
+ processed_mask_image (`None`, *optional*):
+
+ control_image_latents (`None`):
+
+ latents (`Tensor`, *optional*):
+ Pre-generated noisy latents for image generation.
+
+ generator (`Generator`, *optional*):
+ Torch generator for deterministic generation.
+
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps.
+
+ sigmas (`List`, *optional*):
+ Custom sigmas for the denoising process.
+
+ strength (`float`, *optional*, defaults to 0.9):
+ Strength for img2img/inpainting.
+
+ control_guidance_start (`float`, *optional*, defaults to 0.0):
+ When to start applying ControlNet.
+
+ control_guidance_end (`float`, *optional*, defaults to 1.0):
+ When to stop applying ControlNet.
+
+ controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+ Scale for ControlNet conditioning.
+
+ **denoiser_input_fields (`None`, *optional*):
+ All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds,
+ txt_seq_lens/negative_txt_seq_lens.
+
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+
+ Outputs:
+
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
 model_name = "qwenimage"
 block_classes = [
 QwenImageInpaintInputStep(),
@@ -327,9 +988,93 @@
 def description(self):
 return "Before-denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step of the inpaint task."

+ @property
+ def outputs(self):
+ return [
+ OutputParam.latents(),
+ ]
+
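The `mask` and `initial_noise` prepared earlier are what make per-step inpaint blending possible inside the denoise loop. A hedged sketch of that convention (the mask polarity and wiring are assumptions, not this block's code):

def blend_inpaint_latents(latents, image_latents, initial_noise, mask, scheduler, t):
    # Re-noise the clean image latents to the upcoming timestep `t` (a
    # 1-element tensor), then keep the denoiser's prediction only where
    # mask == 1 (the region being repainted).
    noised = scheduler.scale_noise(image_latents, t, initial_noise)
    return (1.0 - mask) * noised + mask * latents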
 # Qwen Image (image2image) with controlnet
+#auto_docstring
 class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
+ """
+ class QwenImageControlNetImg2ImgCoreDenoiseStep
+
+ Before-denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step of the img2img task.
+
+ Components:
+
+ pachifier (`QwenImagePachifier`) [subfolder=]
+
+ scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+
+ controlnet (`QwenImageControlNetModel`) [subfolder=]
+
+ guider (`ClassifierFreeGuidance`) [subfolder=]
+
+ transformer (`QwenImageTransformer2DModel`) [subfolder=]
+
+ Inputs:
+
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+
+ prompt_embeds (`None`):
+
+ prompt_embeds_mask (`None`):
+
+ negative_prompt_embeds (`None`, *optional*):
+
+ negative_prompt_embeds_mask (`None`, *optional*):
+
+ height (`int`, *optional*):
+ The height in pixels of the generated image.
+
+ width (`int`, *optional*):
+ The width in pixels of the generated image.
+
+ image_latents (`None`, *optional*):
+
+ control_image_latents (`None`):
+
+ latents (`Tensor`, *optional*):
+ Pre-generated noisy latents for image generation.
+
+ generator (`Generator`, *optional*):
+ Torch generator for deterministic generation.
+
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps.
+
+ sigmas (`List`, *optional*):
+ Custom sigmas for the denoising process.
+
+ strength (`float`, *optional*, defaults to 0.9):
+ Strength for img2img/inpainting.
+
+ control_guidance_start (`float`, *optional*, defaults to 0.0):
+ When to start applying ControlNet.
+
+ control_guidance_end (`float`, *optional*, defaults to 1.0):
+ When to stop applying ControlNet.
+
+ controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
+ Scale for ControlNet conditioning.
+
+ **denoiser_input_fields (`None`, *optional*):
+ All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds,
+ txt_seq_lens/negative_txt_seq_lens.
+
+ attention_kwargs (`Dict`, *optional*):
+ Additional kwargs for attention processors.
+
+ Outputs:
+
+ latents (`Tensor`):
+ Denoised latents.
+ """
+
 model_name = "qwenimage"
 block_classes = [
 QwenImageImg2ImgInputStep(),
@@ -357,7 +1102,12 @@
 @property
 def description(self):
 return "Before-denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step of the img2img task."
-
+
+ @property
+ def outputs(self):
+ return [
+ OutputParam.latents(),
+ ]

 # Auto denoise step for QwenImage
 class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks):
@@ -426,7 +1176,32 @@
 # standard decode step works for most tasks except for inpaint
+#auto_docstring
 class QwenImageDecodeStep(SequentialPipelineBlocks):
+ """
+ class QwenImageDecodeStep
+
+ Decode step that decodes the latents to images and postprocesses the generated image.
+
+ Components:
+
+ vae (`AutoencoderKLQwenImage`) [subfolder=]
+
+ image_processor (`VaeImageProcessor`) [subfolder=]
+
+ Inputs:
+
+ latents (`Tensor`):
+ The latents to decode; can be generated in the denoise step
+
+ output_type (`str`, *optional*, defaults to pil):
+ Output format: 'pil', 'np', or 'pt'.
+
+ Outputs:
+
+ images (`List`):
+ Generated images.
+ """
 model_name = "qwenimage"
 block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()]
 block_names = ["decode", "postprocess"]
@@ -437,7 +1212,34 @@
 # Inpaint decode step
+#auto_docstring
 class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
+ """
+ class QwenImageInpaintDecodeStep
+
+ Decode step that decodes the latents to images and postprocesses the generated image, and optionally applies the mask overlay to the original image.
+
+ Components:
+
+ vae (`AutoencoderKLQwenImage`) [subfolder=]
+
+ image_mask_processor (`InpaintProcessor`) [subfolder=]
+
+ Inputs:
+
+ latents (`Tensor`):
+ The latents to decode; can be generated in the denoise step
+
+ output_type (`str`, *optional*, defaults to pil):
+ Output format: 'pil', 'np', or 'pt'.
+
+ mask_overlay_kwargs (`None`, *optional*):
+
+ Outputs:
+
+ images (`List`):
+ Generated images.
+ """
 model_name = "qwenimage"
 block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()]
 block_names = ["decode", "postprocess"]
diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py
new file mode 100644
index 0000000000..c6aaf8a46a
--- /dev/null
+++ b/utils/modular_auto_docstring.py
@@ -0,0 +1,296 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Auto Docstring Generator for Modular Pipeline Blocks + +This script scans Python files for classes that have `# auto_docstring` comment above them +and inserts/updates the docstring from the class's `doc` property. + +Run from the root of the repo: + python utils/modular_auto_docstring.py [path] [--fix_and_overwrite] + +Examples: + # Check for auto_docstring markers (will error if found without proper docstring) + python utils/modular_auto_docstring.py + + # Check specific directory + python utils/modular_auto_docstring.py src/diffusers/modular_pipelines/ + + # Fix and overwrite the docstrings + python utils/modular_auto_docstring.py --fix_and_overwrite + +Usage in code: + # auto_docstring + class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): + # docstring will be automatically inserted here + + @property + def doc(self): + return "Your docstring content..." +""" + +import argparse +import ast +import glob +import importlib +import os +import re +import sys + + +# All paths are set with the intent you should run this script from the root of the repo +DIFFUSERS_PATH = "src/diffusers" +REPO_PATH = "." + +# Pattern to match the auto_docstring comment +AUTO_DOCSTRING_PATTERN = re.compile(r"^\s*#\s*auto_docstring\s*$") + + +def setup_diffusers_import(): + """Setup import path to use the local diffusers module.""" + src_path = os.path.join(REPO_PATH, "src") + if src_path not in sys.path: + sys.path.insert(0, src_path) + + +def get_module_from_filepath(filepath: str) -> str: + """Convert a filepath to a module name.""" + filepath = os.path.normpath(filepath) + + if filepath.startswith("src" + os.sep): + filepath = filepath[4:] + + if filepath.endswith(".py"): + filepath = filepath[:-3] + + module_name = filepath.replace(os.sep, ".") + return module_name + + +def load_module(filepath: str): + """Load a module from filepath.""" + setup_diffusers_import() + module_name = get_module_from_filepath(filepath) + + try: + module = importlib.import_module(module_name) + return module + except Exception as e: + print(f"Warning: Could not import module {module_name}: {e}") + return None + + +def get_doc_from_class(module, class_name: str) -> str: + """Get the doc property from an instantiated class.""" + if module is None: + return None + + cls = getattr(module, class_name, None) + if cls is None: + return None + + try: + instance = cls() + if hasattr(instance, "doc"): + return instance.doc + except Exception as e: + print(f"Warning: Could not instantiate {class_name}: {e}") + + return None + + +def find_auto_docstring_classes(filepath: str) -> list: + """ + Find all classes in a file that have # auto_docstring comment above them. 
+ + Returns list of (class_name, class_line_number, has_existing_docstring, docstring_end_line) + """ + with open(filepath, "r", encoding="utf-8", newline="\n") as f: + lines = f.readlines() + + # Parse AST to find class locations and their docstrings + content = "".join(lines) + try: + tree = ast.parse(content) + except SyntaxError as e: + print(f"Syntax error in {filepath}: {e}") + return [] + + # Build a map of class_name -> (class_line, has_docstring, docstring_end_line) + class_info = {} + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + has_docstring = False + docstring_end_line = node.lineno # default to class line + + if node.body and isinstance(node.body[0], ast.Expr): + first_stmt = node.body[0] + if isinstance(first_stmt.value, ast.Constant) and isinstance(first_stmt.value.value, str): + has_docstring = True + docstring_end_line = first_stmt.end_lineno or first_stmt.lineno + + class_info[node.name] = (node.lineno, has_docstring, docstring_end_line) + + # Now scan for # auto_docstring comments + classes_to_update = [] + + for i, line in enumerate(lines): + if AUTO_DOCSTRING_PATTERN.match(line): + # Found the marker, look for class definition on next non-empty, non-comment line + j = i + 1 + while j < len(lines): + next_line = lines[j].strip() + if next_line and not next_line.startswith("#"): + break + j += 1 + + if j < len(lines) and lines[j].strip().startswith("class "): + # Extract class name + match = re.match(r"class\s+(\w+)", lines[j].strip()) + if match: + class_name = match.group(1) + if class_name in class_info: + class_line, has_docstring, docstring_end_line = class_info[class_name] + classes_to_update.append(( + class_name, + class_line, + has_docstring, + docstring_end_line + )) + + return classes_to_update + + +def format_docstring(doc: str, indent: str = " ") -> str: + """Format a doc string as a properly indented docstring.""" + lines = doc.strip().split("\n") + + if len(lines) == 1: + return f'{indent}"""{lines[0]}"""\n' + else: + result = [f'{indent}"""\n'] + for line in lines: + if line.strip(): + result.append(f"{indent}{line}\n") + else: + result.append("\n") + result.append(f'{indent}"""\n') + return "".join(result) + + +def process_file(filepath: str, overwrite: bool = False) -> list: + """ + Process a file and find/insert docstrings for # auto_docstring marked classes. + + Returns list of classes that need updating. 
+ """ + classes_to_update = find_auto_docstring_classes(filepath) + + if not classes_to_update: + return [] + + if not overwrite: + # Just return the list of classes that need updating + return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] + + # Load the module to get doc properties + module = load_module(filepath) + + with open(filepath, "r", encoding="utf-8", newline="\n") as f: + lines = f.readlines() + + # Process in reverse order to maintain line numbers + updated = False + for class_name, class_line, has_docstring, docstring_end_line in reversed(classes_to_update): + doc = get_doc_from_class(module, class_name) + + if doc is None: + print(f"Warning: Could not get doc for {class_name} in {filepath}") + continue + + # Format the new docstring with 4-space indent + new_docstring = format_docstring(doc, " ") + + if has_docstring: + # Replace existing docstring (line after class definition to docstring_end_line) + # class_line is 1-indexed, we want to replace from class_line+1 to docstring_end_line + lines = lines[:class_line] + [new_docstring] + lines[docstring_end_line:] + else: + # Insert new docstring right after class definition line + # class_line is 1-indexed, so lines[class_line-1] is the class line + # Insert at position class_line (which is right after the class line) + lines = lines[:class_line] + [new_docstring] + lines[class_line:] + + updated = True + print(f"Updated docstring for {class_name} in {filepath}") + + if updated: + with open(filepath, "w", encoding="utf-8", newline="\n") as f: + f.writelines(lines) + + return [(filepath, cls_name, line) for cls_name, line, _, _ in classes_to_update] + + +def check_auto_docstrings(path: str = None, overwrite: bool = False): + """ + Check all files for # auto_docstring markers and optionally fix them. + """ + if path is None: + path = DIFFUSERS_PATH + + if os.path.isfile(path): + all_files = [path] + else: + all_files = glob.glob(os.path.join(path, "**/*.py"), recursive=True) + + all_markers = [] + + for filepath in all_files: + markers = process_file(filepath, overwrite) + all_markers.extend(markers) + + if not overwrite and len(all_markers) > 0: + message = "\n".join([f"- {f}: {cls} at line {line}" for f, cls, line in all_markers]) + raise ValueError( + f"Found the following # auto_docstring markers that need docstrings:\n{message}\n\n" + f"Run `python utils/modular_auto_docstring.py --fix_and_overwrite` to fix them." + ) + + if overwrite and len(all_markers) > 0: + print(f"\nUpdated {len(all_markers)} docstring(s).") + elif len(all_markers) == 0: + print("No # auto_docstring markers found.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Check and fix # auto_docstring markers in modular pipeline blocks", + ) + parser.add_argument( + "path", + nargs="?", + default=None, + help="File or directory to process (default: src/diffusers)" + ) + parser.add_argument( + "--fix_and_overwrite", + action="store_true", + help="Whether to fix the docstrings by inserting them from doc property.", + ) + + args = parser.parse_args() + + check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file