From aea0d046f6eb759dca55a11bd9c55f89db39b3e4 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 17 Jan 2026 09:36:58 +0100 Subject: [PATCH] address feedbacks --- .../modular_pipeline_utils.py | 4 +- .../qwenimage/modular_blocks_qwenimage.py | 408 ++++-------------- .../modular_blocks_qwenimage_edit.py | 256 +++-------- .../modular_blocks_qwenimage_edit_plus.py | 147 ++----- .../modular_blocks_qwenimage_layered.py | 190 +++----- utils/modular_auto_docstring.py | 16 +- 6 files changed, 271 insertions(+), 750 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index fab7c7193e..368fbbcbd1 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -711,7 +711,7 @@ def format_params(params, header="Args", indent_level=4, max_line_length=115): formatted_params.append(param_str) - return "\n\n".join(formatted_params) + return "\n".join(formatted_params) def format_input_params(input_params, indent_level=4, max_line_length=115): @@ -781,7 +781,7 @@ def format_components(components, indent_level=4, max_line_length=115, add_empty loading_field_values = [] for field_name in component.loading_fields(): field_value = getattr(component, field_name) - if field_value is not None: + if field_value: loading_field_values.append(f"{field_name}={field_value}") # Add loading field information if available diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 85b77c2a6b..3bd4ae5683 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -62,50 +62,44 @@ logger = logging.get_logger(__name__) # auto_docstring class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): """ - class QwenImageAutoTextEncoderStep - - Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. + Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - prompt (`str`, *optional*): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. 
Outputs: - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -130,48 +124,36 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): # auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageInpaintVaeEncoderStep - - This step is used for processing image and mask inputs for inpainting tasks. It: + This step is used for processing image and mask inputs for inpainting tasks. It: - Resizes the image to the target size, based on `height` and `width`. - Processes and updates `image` and `mask_image`. - Creates `image_latents`. Components: - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - mask_image (`Image`): Mask image for inpainting. - image (`Image`): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - processed_mask_image (`None`): - mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -193,34 +175,26 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgVaeEncoderStep - - Vae encoder step that preprocess andencode the image inputs into their latent representations. + Vae encoder step that preprocess andencode the image inputs into their latent representations. Components: - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -255,36 +229,30 @@ class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): # auto_docstring class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ - class QwenImageOptionalControlNetVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. + This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. 
Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) [subfolder=] + control_image_processor (`VaeImageProcessor`) Inputs: - control_image (`Image`, *optional*): Control image for ControlNet conditioning. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - control_image_latents (`Tensor`): The latents representing the control image """ @@ -312,46 +280,32 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): # auto_docstring class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgInputStep - - Input step that prepares the inputs for the img2img denoising step. It: + Input step that prepares the inputs for the img2img denoising step. It: Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -370,48 +324,33 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): # auto_docstring class QwenImageInpaintInputStep(SequentialPipelineBlocks): """ - class QwenImageInpaintInputStep - - Input step that prepares the inputs for the inpainting denoising step. It: + Input step that prepares the inputs for the inpainting denoising step. It: Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. 
- image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -436,44 +375,32 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): # auto_docstring class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ - class QwenImageInpaintPrepareLatentsStep - - This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It: + This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It: - Add noise to the image latents to create the latents input for the denoiser. - Create the pachified latents `mask` based on the processedmask image. Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - latents (`Tensor`): The initial random noised, can be generated in prepare latent step. - image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. - + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. - processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - width (`None`): - dtype (`None`): Outputs: - initial_noise (`Tensor`): The initial random noised used for inpainting denoising. - mask (`Tensor`): The mask to use for the inpainting process. """ @@ -498,60 +425,43 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageCoreDenoiseStep - - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - generator (`Generator`, *optional*): Torch generator for deterministic generation. 
- num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -589,67 +499,47 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageInpaintCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -689,65 +579,46 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageImg2ImgCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. 
- prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -787,74 +658,53 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetCoreDenoiseStep - - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the - inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - control_image_latents (`None`): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. 
""" @@ -896,81 +746,57 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetInpaintCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - control_image_latents (`None`): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -1014,79 +840,56 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageControlNetImg2ImgCoreDenoiseStep - - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for - img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. 
Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - control_image_latents (`None`): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - **denoiser_input_fields (`None`, *optional*): All conditional model inputs for the denoiser. It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -1196,26 +999,21 @@ class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): # auto_docstring class QwenImageDecodeStep(SequentialPipelineBlocks): """ - class QwenImageDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image. + Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -1233,29 +1031,22 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - class QwenImageInpaintDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask - overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. 
Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ @@ -1302,131 +1093,102 @@ AUTO_BLOCKS = InsertableDict( # auto_docstring class QwenImageAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageAutoBlocks - - Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. + Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. - for image-to-image generation, you need to provide `image` - - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`. - to run the controlnet workflow, you need to provide `control_image` - for text-to-image generation, all you need to provide is `prompt` Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - controlnet (`QwenImageControlNetModel`) [subfolder=] + controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) [subfolder=] + control_image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - prompt (`str`, *optional*): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. - mask_image (`Image`, *optional*): Mask image for inpainting. - image (`Image`, *optional*): Input image for img2img, editing, or conditioning. - height (`int`, *optional*): The height in pixels of the generated image. 
- width (`int`, *optional*): The width in pixels of the generated image. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - control_image (`Image`, *optional*): Control image for ControlNet conditioning. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - latents (`Tensor`): Pre-generated noisy latents for image generation. - num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - control_image_latents (`None`, *optional*): - control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. - control_guidance_end (`float`, *optional*, defaults to 1.0): When to stop applying ControlNet. - controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 3fcbc8853f..627cfce6ee 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -59,55 +59,46 @@ logger = logging.get_logger(__name__) # auto_docstring class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditVLEncoderStep - - QwenImage-Edit VL encoder step that encode the image and text prompts together. + QwenImage-Edit VL encoder step that encode the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. Outputs: - resized_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -133,33 +124,26 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. + Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -181,47 +165,36 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintVaeEncoderStep - - This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: + This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It: - resize the image for target area (1024 * 1024) while maintaining the aspect ratio. - process the resized image and mask image. - create image latents. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - mask_image (`Image`): Mask image for inpainting. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - processed_mask_image (`None`): - mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. 
""" @@ -270,48 +243,34 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks): # auto_docstring class QwenImageEditInputStep(SequentialPipelineBlocks): """ - class QwenImageEditInputStep - - Input step that prepares the inputs for the edit denoising step. It: + Input step that prepares the inputs for the edit denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -335,50 +294,35 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintInputStep - - Input step that prepares the inputs for the edit inpaint denoising step. It: + Input step that prepares the inputs for the edit inpaint denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension """ @@ -405,44 +349,32 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintPrepareLatentsStep - - This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: + This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It: - Add noise to the image latents to create the latents input for the denoiser. - Create the patchified latents `mask` based on the processed mask image. 
Components: - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - latents (`Tensor`): The initial random noised, can be generated in prepare latent step. - image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input - step. - + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. - processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. - height (`None`): - width (`None`): - dtype (`None`): Outputs: - initial_noise (`Tensor`): The initial random noised used for inpainting denoising. - mask (`Tensor`): The mask to use for the inpainting process. """ @@ -464,61 +396,44 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit edit (img2img) task. + Core denoising workflow for QwenImage-Edit edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -556,66 +471,47 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit edit inpaint task. + Core denoising workflow for QwenImage-Edit edit inpaint task. 
Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - processed_mask_image (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -694,26 +590,21 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks): # auto_docstring class QwenImageEditDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image. + Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -731,29 +622,22 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditInpaintDecodeStep - - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask - overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. 
""" @@ -806,103 +690,81 @@ EDIT_AUTO_BLOCKS = InsertableDict( # auto_docstring class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageEditAutoBlocks - - Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. + Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. - for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide - `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) [subfolder=] + image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - mask_image (`Image`, *optional*): Mask image for inpainting. - padding_mask_crop (`int`, *optional*): Padding for mask cropping in inpainting. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - height (`int`): The height in pixels of the generated image. - width (`int`): The width in pixels of the generated image. - image_latents (`None`): - processed_mask_image (`None`, *optional*): - latents (`Tensor`): Pre-generated noisy latents for image generation. - num_inference_steps (`int`): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. 
- strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. - mask_overlay_kwargs (`None`, *optional*): Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 0364e394d2..cc07fc1e6a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -52,57 +52,48 @@ logger = logging.get_logger(__name__) # auto_docstring class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusVLEncoderStep - - QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. + QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. Outputs: - resized_cond_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -127,34 +118,27 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusVaeEncoderStep - - VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based - on its own aspect ratio to 1024x1024 target area. 
+ VAE encoder step that encodes image inputs into latent representations. + Each image is resized independently based on its own aspect ratio to 1024x1024 target area. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -184,9 +168,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditPlusInputStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusInputStep - - Input step that prepares the inputs for the Edit Plus denoising step. It: + Input step that prepares the inputs for the Edit Plus denoising step. It: - Standardizes text embeddings batch size. - Processes list of image latents: patchifies, concatenates along dim=1, expands batch. - Outputs lists of image_height/image_width for RoPE calculation. @@ -194,40 +176,28 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`List`): The image heights calculated from the image latents dimension - image_width (`List`): The image widths calculated from the image latents dimension """ @@ -254,61 +224,44 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusCoreDenoiseStep - - Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. + Core denoising workflow for QwenImage-Edit Plus edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - height (`int`, *optional*): The height in pixels of the generated image. 
- width (`int`, *optional*): The width in pixels of the generated image. - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -350,26 +303,21 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): """ - class QwenImageEditPlusDecodeStep - - Decode step that decodes the latents to images and postprocesses the generated image. + Decode step that decodes the latents to images and postprocesses the generated image. Components: - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) Inputs: - latents (`Tensor`): The latents to decode, can be generated in the denoise step - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ @@ -400,88 +348,73 @@ EDIT_PLUS_AUTO_BLOCKS = InsertableDict( # auto_docstring class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageEditPlusAutoBlocks - - Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. + Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus. - `image` is required input (can be single image or list of images). - Each image is resized independently based on its own aspect ratio. - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImagePachifier`) [subfolder=] + pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how - the user's text instruction should alter or modify the image. 
Generate a new image that meets the user's - requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user - {}<|im_end|> <|im_start|>assistant ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) prompt_template_encode_start_idx (default: 64) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - prompt (`str`): The prompt or prompts to guide image generation. - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - height (`int`, *optional*): The height in pixels of the generated image. - width (`int`, *optional*): The width in pixels of the generated image. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 5602fc9b93..7cbc174871 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -53,43 +53,45 @@ logger = logging.get_logger(__name__) # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - class QwenImageLayeredTextEncoderStep - - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not - provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. 
- Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -99,50 +101,44 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - prompt (`str`, *optional*): The prompt to encode - use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. 
Outputs: - resized_image (`List`): The resized images - prompt_embeds (`Tensor`): The prompt embeddings - prompt_embeds_mask (`Tensor`): The encoder attention mask - negative_prompt_embeds (`Tensor`): The negative prompt embeddings - negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ @@ -169,36 +165,28 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): """ - class QwenImageLayeredVaeEncoderStep - - Vae encoder step that encode the image inputs into their latent representations. + Vae encoder step that encode the image inputs into their latent representations. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - generator (`Generator`, *optional*): Torch generator for deterministic generation. Outputs: - resized_image (`List`): The resized images - processed_image (`None`): - image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -226,48 +214,34 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): # auto_docstring class QwenImageLayeredInputStep(SequentialPipelineBlocks): """ - class QwenImageLayeredInputStep - - Input step that prepares the inputs for the layered denoising step. It: + Input step that prepares the inputs for the layered denoising step. It: - make sure the text embeddings have consistent batch size as well as the additional inputs. - update height/width based `image_latents`, patchify `image_latents`. Components: - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - image_latents (`None`, *optional*): Outputs: - batch_size (`int`): Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt - dtype (`dtype`): Data type of model tensor inputs (determined by `prompt_embeds`) - image_height (`int`): The image height calculated from the image latents dimension - image_width (`int`): The image width calculated from the image latents dimension - height (`int`): The height of the image output - width (`int`): The width of the image output """ @@ -292,58 +266,42 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): # auto_docstring class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): """ - class QwenImageLayeredCoreDenoiseStep - - Core denoising workflow for QwenImage-Layered img2img task. + Core denoising workflow for QwenImage-Layered img2img task. 
Components: - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Inputs: - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - prompt_embeds (`None`): - prompt_embeds_mask (`None`): - negative_prompt_embeds (`None`, *optional*): - negative_prompt_embeds_mask (`None`, *optional*): - image_latents (`None`, *optional*): - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. - attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. Outputs: - latents (`Tensor`): Denoised latents. """ @@ -394,52 +352,55 @@ LAYERED_AUTO_BLOCKS = InsertableDict( # auto_docstring class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): """ - class QwenImageLayeredAutoBlocks - - Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. + Auto Modular pipeline for layered denoising tasks using QwenImage-Layered. Components: - image_resize_processor (`VaeImageProcessor`) [subfolder=] + image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=] + text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) [subfolder=] + processor (`Qwen2VLProcessor`) - tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=] + tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) [subfolder=] + guider (`ClassifierFreeGuidance`) - image_processor (`VaeImageProcessor`) [subfolder=] + image_processor (`VaeImageProcessor`) - vae (`AutoencoderKLQwenImage`) [subfolder=] + vae (`AutoencoderKLQwenImage`) - pachifier (`QwenImageLayeredPachifier`) [subfolder=] + pachifier (`QwenImageLayeredPachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=] + scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) [subfolder=] + transformer (`QwenImageTransformer2DModel`) Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. - Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # Image Annotator + You are a professional image annotator. Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. 
Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, - attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the - caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> + <|im_start|>user + # 图像标注器 + 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -449,65 +410,54 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> + <|im_start|>assistant + ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the - objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> + <|im_start|>user + {}<|im_end|> + <|im_start|>assistant + ) prompt_template_encode_start_idx (default: 34) tokenizer_max_length (default: 1024) Inputs: - image (`Image`): Input image for img2img, editing, or conditioning. - resolution (`int`, *optional*, defaults to 640): The target area to resize the image to, can be 1024 or 640 - prompt (`str`, *optional*): The prompt to encode - use_en_prompt (`bool`, *optional*, defaults to False): Whether to use English prompt template - negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. - max_sequence_length (`int`, *optional*, defaults to 1024): Maximum sequence length for prompt encoding. - generator (`Generator`, *optional*): Torch generator for deterministic generation. - num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. - layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image - num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. - sigmas (`List`, *optional*): Custom sigmas for the denoising process. 
- attention_kwargs (`Dict`, *optional*): Additional kwargs for attention processors. - **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. - output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. Outputs: - images (`List`): Generated images. """ diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py index e2d523b2f3..01d984a584 100644 --- a/utils/modular_auto_docstring.py +++ b/utils/modular_auto_docstring.py @@ -169,6 +169,17 @@ def find_auto_docstring_classes(filepath: str) -> list: return classes_to_update +def strip_class_name_line(doc: str, class_name: str) -> str: + """Remove the 'class ClassName' line from the doc if present.""" + lines = doc.strip().split("\n") + if lines and lines[0].strip() == f"class {class_name}": + # Remove the class line and any blank line following it + lines = lines[1:] + while lines and not lines[0].strip(): + lines = lines[1:] + return "\n".join(lines) + + def format_docstring(doc: str, indent: str = " ") -> str: """Format a doc string as a properly indented docstring.""" lines = doc.strip().split("\n") @@ -216,6 +227,9 @@ def process_file(filepath: str, overwrite: bool = False) -> list: print(f"Warning: Could not get doc for {class_name} in {filepath}") continue + # Remove the "class ClassName" line since it's redundant in a docstring + doc = strip_class_name_line(doc, class_name) + # Format the new docstring with 4-space indent new_docstring = format_docstring(doc, " ") @@ -283,4 +297,4 @@ if __name__ == "__main__": args = parser.parse_args() - check_auto_docstrings(args.path, args.fix_and_overwrite) + check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file
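As a quick illustration of the new strip_class_name_line helper, the sketch below feeds it a made-up doc string; it assumes the repository root is on sys.path so the utils script can be imported as a module.

```python
# Minimal, illustrative check of strip_class_name_line from utils/modular_auto_docstring.py.
# The sample doc text is invented for this example; importing the script as a module
# assumes the repository root is on sys.path (utils/ is not an installed package).
from utils.modular_auto_docstring import strip_class_name_line

doc = (
    "class QwenImageExampleStep\n"
    "\n"
    "Example step description used only to illustrate the helper.\n"
)

# The leading "class QwenImageExampleStep" line and the blank line after it are
# dropped, so the cleaned doc starts directly with the description.
cleaned = strip_class_name_line(doc, "QwenImageExampleStep")
assert cleaned == "Example step description used only to illustrate the helper."

# Docs that do not start with "class <ClassName>" pass through unchanged
# (apart from the outer whitespace strip applied by the helper).
assert strip_class_name_line("Some other doc", "QwenImageExampleStep") == "Some other doc"
```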
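The same helper composes with the existing format_docstring call inside process_file; the sketch below mirrors that order with a made-up stand-in for the doc source, since the hunk does not show how the doc text is obtained from the class.

```python
# Sketch of the order of operations wired into process_file: strip the redundant
# "class <Name>" heading first, then run the existing format_docstring(doc, "    ")
# to produce the 4-space-indented docstring. get_generated_doc is a made-up
# placeholder for however process_file actually obtains the generated doc text.
from utils.modular_auto_docstring import format_docstring, strip_class_name_line


def get_generated_doc(class_name: str) -> str:
    # Placeholder stand-in; the real doc comes from the modular pipeline class itself.
    return f"class {class_name}\n\nCore denoising workflow for QwenImage-Layered img2img task."


class_name = "QwenImageLayeredCoreDenoiseStep"  # one of the classes touched in this patch
doc = get_generated_doc(class_name)
doc = strip_class_name_line(doc, class_name)
new_docstring = format_docstring(doc, "    ")
print(new_docstring)
```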
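To regenerate the affected docstrings, the checker can be driven programmatically along the lines shown below; only the check_auto_docstrings(path, fix_and_overwrite) call shape appears in the patch, so the target path and the rewrite-in-place behavior of the second argument are assumptions.

```python
# Running the checker over the QwenImage modular pipeline files touched by this patch.
# The path below is an example; passing True as the second argument is assumed to
# rewrite the # auto_docstring docstrings in place rather than only reporting drift.
from utils.modular_auto_docstring import check_auto_docstrings

check_auto_docstrings("src/diffusers/modular_pipelines/qwenimage", True)
```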
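Finally, a hedged way to eyeball the result on one of the QwenImage blocks touched here; exposing the generated text through a doc attribute is an assumption (the utility script only refers to it indirectly), while the class and module names are taken from the modified files.

```python
# Hypothetical inspection of a regenerated docstring. The `doc` attribute name is
# assumed and guarded with getattr; the class and module names come from the files
# modified in this patch.
from diffusers.modular_pipelines.qwenimage.modular_blocks_qwenimage_edit_plus import (
    QwenImageEditPlusAutoBlocks,
)

blocks = QwenImageEditPlusAutoBlocks()
generated = getattr(blocks, "doc", None)  # assumed attribute; may be exposed differently
if generated is not None:
    # After this patch the text should open with the task description
    # ("Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.")
    # instead of a redundant "class QwenImageEditPlusAutoBlocks" line, and component
    # entries should no longer carry empty "[subfolder=]" suffixes.
    print(generated[:500])
```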