
address feedbacks

This commit is contained in:
yiyixuxu
2026-01-17 09:36:58 +01:00
parent 1c90ce33f2
commit aea0d046f6
6 changed files with 271 additions and 750 deletions


@@ -711,7 +711,7 @@ def format_params(params, header="Args", indent_level=4, max_line_length=115):
         formatted_params.append(param_str)
-    return "\n\n".join(formatted_params)
+    return "\n".join(formatted_params)

 def format_input_params(input_params, indent_level=4, max_line_length=115):
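The one-character change above switches the separator between formatted parameter blocks from a blank line to a single newline. A minimal runnable sketch of the difference (the parameter strings are illustrative, not the formatter's real output):

formatted_params = [
    "    prompt (`str`):\n        The prompt to guide generation.",
    "    width (`int`, *optional*):\n        Output width in pixels.",
]

print("\n\n".join(formatted_params))  # old: a blank line between parameter blocks
print("\n".join(formatted_params))    # new: parameter blocks on consecutive lines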
@@ -781,7 +781,7 @@ def format_components(components, indent_level=4, max_line_length=115, add_empty
         loading_field_values = []
         for field_name in component.loading_fields():
             field_value = getattr(component, field_name)
-            if field_value is not None:
+            if field_value:
                 loading_field_values.append(f"{field_name}={field_value}")
         # Add loading field information if available
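This truthiness check is why the `[subfolder=]` suffixes vanish from the docstrings below: an empty-string subfolder is not None, but it is falsy. A self-contained sketch (the stand-in class is hypothetical, modeled on the loop above):

class FakeComponentSpec:
    """Hypothetical stand-in for a component spec with loading fields."""

    subfolder = ""   # empty string: not None, but falsy
    revision = None
    variant = "fp16"

    def loading_fields(self):
        return ["subfolder", "revision", "variant"]

component = FakeComponentSpec()
loading_field_values = []
for field_name in component.loading_fields():
    field_value = getattr(component, field_name)
    if field_value:  # the old `is not None` check kept entries like `subfolder=`
        loading_field_values.append(f"{field_name}={field_value}")

print(loading_field_values)  # ['variant=fp16']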


@@ -59,55 +59,46 @@ logger = logging.get_logger(__name__)
# auto_docstring
class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageEditVLEncoderStep
QwenImage-Edit VL encoder step that encodes the image and text prompts together.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`) [subfolder=]
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>
<|im_start|>assistant
)
prompt_template_encode_start_idx (default: 64)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
Outputs:
resized_image (`List`):
The resized images
prompt_embeds (`Tensor`):
The prompt embeddings
prompt_embeds_mask (`Tensor`):
The encoder attention mask
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask
"""
@@ -133,33 +124,26 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageEditVaeEncoderStep
VAE encoder step that encodes the image inputs into their latent representations.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`None`):
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
@@ -181,47 +165,36 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintVaeEncoderStep
This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:
- resize the image to the target area (1024 * 1024) while maintaining the aspect ratio.
- process the resized image and mask image.
- create image latents.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
image_mask_processor (`InpaintProcessor`) [subfolder=]
image_mask_processor (`InpaintProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
mask_image (`Image`):
Mask image for inpainting.
padding_mask_crop (`int`, *optional*):
Padding for mask cropping in inpainting.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`None`):
processed_mask_image (`None`):
mask_overlay_kwargs (`Dict`):
The kwargs for the postprocess step to apply the mask overlay
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
@@ -270,48 +243,34 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
# auto_docstring
class QwenImageEditInputStep(SequentialPipelineBlocks):
"""
class QwenImageEditInputStep
Input step that prepares the inputs for the edit denoising step. It:
- make sure the text embeddings, as well as the additional inputs, have a consistent batch size.
- update height/width based on `image_latents`, patchify `image_latents`.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
Outputs:
batch_size (`int`):
Number of prompts; the final batch size of model inputs will be batch_size * num_images_per_prompt
dtype (`dtype`):
Data type of model tensor inputs (determined by `prompt_embeds`)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
"""
@@ -335,50 +294,35 @@ class QwenImageEditInputStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintInputStep
Input step that prepares the inputs for the edit inpaint denoising step. It:
- make sure the text embeddings, as well as the additional inputs, have a consistent batch size.
- update height/width based on `image_latents`, patchify `image_latents`.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
processed_mask_image (`None`, *optional*):
Outputs:
batch_size (`int`):
Number of prompts; the final batch size of model inputs will be batch_size * num_images_per_prompt
dtype (`dtype`):
Data type of model tensor inputs (determined by `prompt_embeds`)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
"""
@@ -405,44 +349,32 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintPrepareLatentsStep
This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:
- Add noise to the image latents to create the latents input for the denoiser.
- Create the patchified latents `mask` based on the processed mask image.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
Inputs:
latents (`Tensor`):
The initial random noise; can be generated in the prepare latents step.
image_latents (`Tensor`):
The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
step.
The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
processed_mask_image (`Tensor`):
The processed mask to use for the inpainting process.
height (`None`):
width (`None`):
dtype (`None`):
Outputs:
initial_noise (`Tensor`):
The initial random noise used for inpainting denoising.
mask (`Tensor`):
The mask to use for the inpainting process.
"""
@@ -464,61 +396,44 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageEditCoreDenoiseStep
Core denoising workflow for QwenImage-Edit edit (img2img) task.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
@@ -556,66 +471,47 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintCoreDenoiseStep
Core denoising workflow for QwenImage-Edit edit inpaint task.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
processed_mask_image (`None`, *optional*):
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
@@ -694,26 +590,21 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
# auto_docstring
class QwenImageEditDecodeStep(SequentialPipelineBlocks):
"""
class QwenImageEditDecodeStep
Decode step that decodes the latents to images and postprocesses the generated image.
Components:
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
Inputs:
latents (`Tensor`):
The latents to decode, can be generated in the denoise step
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images.
"""
@@ -731,29 +622,22 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintDecodeStep
Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask
overlay to the original image.
Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image.
Components:
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
image_mask_processor (`InpaintProcessor`) [subfolder=]
image_mask_processor (`InpaintProcessor`)
Inputs:
latents (`Tensor`):
The latents to decode, can be generated in the denoise step
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`None`, *optional*):
Outputs:
images (`List`):
Generated images.
"""
@@ -806,103 +690,81 @@ EDIT_AUTO_BLOCKS = InsertableDict(
# auto_docstring
class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
"""
class QwenImageEditAutoBlocks
Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.
- for edit (img2img) generation, you need to provide `image`
- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide
`padding_mask_crop`
- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`) [subfolder=]
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
image_mask_processor (`InpaintProcessor`) [subfolder=]
image_mask_processor (`InpaintProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>
<|im_start|>assistant
)
prompt_template_encode_start_idx (default: 64)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
mask_image (`Image`, *optional*):
Mask image for inpainting.
padding_mask_crop (`int`, *optional*):
Padding for mask cropping in inpainting.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
height (`int`):
The height in pixels of the generated image.
width (`int`):
The width in pixels of the generated image.
image_latents (`None`):
processed_mask_image (`None`, *optional*):
latents (`Tensor`):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`None`, *optional*):
Outputs:
images (`List`):
Generated images.
"""


@@ -52,57 +52,48 @@ logger = logging.get_logger(__name__)
# auto_docstring
class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusVLEncoderStep
QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`) [subfolder=]
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
{}<|im_end|> <|im_start|>assistant )
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)
prompt_template_encode_start_idx (default: 64)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
Outputs:
resized_cond_image (`List`):
The resized images
prompt_embeds (`Tensor`):
The prompt embeddings
prompt_embeds_mask (`Tensor`):
The encoder attention mask
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask
"""
@@ -127,34 +118,27 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusVaeEncoderStep
VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based
on its own aspect ratio to 1024x1024 target area.
VAE encoder step that encodes image inputs into latent representations.
Each image is resized independently based on its own aspect ratio to 1024x1024 target area.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`None`):
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
@@ -184,9 +168,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusInputStep
Input step that prepares the inputs for the Edit Plus denoising step. It:
- Standardizes text embeddings batch size.
- Processes list of image latents: patchifies, concatenates along dim=1, expands batch.
- Outputs lists of image_height/image_width for RoPE calculation.
@@ -194,40 +176,28 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
Outputs:
batch_size (`int`):
Number of prompts; the final batch size of model inputs will be batch_size * num_images_per_prompt
dtype (`dtype`):
Data type of model tensor inputs (determined by `prompt_embeds`)
image_height (`List`):
The image heights calculated from the image latents dimension
image_width (`List`):
The image widths calculated from the image latents dimension
"""
@@ -254,61 +224,44 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusCoreDenoiseStep
Core denoising workflow for QwenImage-Edit Plus edit (img2img) task.
Components:
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
image_latents (`None`, *optional*):
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
@@ -350,26 +303,21 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusDecodeStep
Decode step that decodes the latents to images and postprocesses the generated image.
Components:
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
Inputs:
latents (`Tensor`):
The latents to decode, can be generated in the denoise step
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images.
"""
@@ -400,88 +348,73 @@ EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
# auto_docstring
class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
"""
class QwenImageEditPlusAutoBlocks
Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.
- `image` is required input (can be single image or list of images).
- Each image is resized independently based on its own aspect ratio.
- VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`) [subfolder=]
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
pachifier (`QwenImagePachifier`) [subfolder=]
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Configs:
prompt_template_encode (default: <|im_start|>system
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
the user's text instruction should alter or modify the image. Generate a new image that meets the user's
requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
{}<|im_end|> <|im_start|>assistant )
Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)
prompt_template_encode_start_idx (default: 64)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
prompt (`str`):
The prompt or prompts to guide image generation.
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
The width in pixels of the generated image.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images.
"""


@@ -53,43 +53,45 @@ logger = logging.get_logger(__name__)
# auto_docstring
class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageLayeredTextEncoderStep
QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not
provided.
QwenImage-Layered Text encoder step that encodes the text prompt; a prompt will be generated from the image if none is provided.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`) [subfolder=]
processor (`Qwen2VLProcessor`)
tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=]
tokenizer (`Qwen2Tokenizer`): The tokenizer to use
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
Configs:
image_caption_prompt_en (default: <|im_start|>system
You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
Please write an image caption based on the input image:
You are a helpful assistant.<|im_end|>
<|im_start|>user
# Image Annotator
You are a professional image annotator. Please write an image caption based on the input image:
1. Write the caption using natural, descriptive language without structured formats or rich text.
2. Enrich caption details by including:
- Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
attachment relations, action relations, comparative relations, causal relations, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on
- Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
caption with quotation marks
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks
3. Maintain authenticity and accuracy:
- Avoid generalizations
- Describe all visible information in the image, while do not add information not explicitly shown in the image
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
)
image_caption_prompt_cn (default: <|im_start|>system
You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注:
You are a helpful assistant.<|im_end|>
<|im_start|>user
# 图像标注器
你是一个专业的图像标注器。请基于输入图像,撰写图注:
1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。
2. 通过加入以下内容,丰富图注细节:
- 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等
@@ -99,50 +101,44 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
3. 保持真实性与准确性:
- 不要使用笼统的描述
- 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
)
prompt_template_encode (default: <|im_start|>system
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
prompt_template_encode_start_idx (default: 34)
tokenizer_max_length (default: 1024)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to; can be 1024 or 640
prompt (`str`, *optional*):
The prompt to encode
use_en_prompt (`bool`, *optional*, defaults to False):
Whether to use English prompt template
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
max_sequence_length (`int`, *optional*, defaults to 1024):
Maximum sequence length for prompt encoding.
Outputs:
resized_image (`List`):
The resized images
prompt_embeds (`Tensor`):
The prompt embeddings
prompt_embeds_mask (`Tensor`):
The encoder attention mask
negative_prompt_embeds (`Tensor`):
The negative prompt embeddings
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask
"""
@@ -169,36 +165,28 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageLayeredVaeEncoderStep
VAE encoder step that encodes the image inputs into their latent representations.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to; can be 1024 or 640
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
Outputs:
resized_image (`List`):
The resized images
processed_image (`None`):
image_latents (`Tensor`):
The latents representing the reference image(s). Single tensor or list depending on input.
"""
@@ -226,48 +214,34 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageLayeredInputStep(SequentialPipelineBlocks):
"""
class QwenImageLayeredInputStep
Input step that prepares the inputs for the layered denoising step. It:
- make sure the text embeddings, as well as the additional inputs, have a consistent batch size.
- update height/width based on `image_latents`, patchify `image_latents`.
Components:
pachifier (`QwenImageLayeredPachifier`) [subfolder=]
pachifier (`QwenImageLayeredPachifier`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
image_latents (`None`, *optional*):
Outputs:
batch_size (`int`):
Number of prompts; the final batch size of model inputs will be batch_size * num_images_per_prompt
dtype (`dtype`):
Data type of model tensor inputs (determined by `prompt_embeds`)
image_height (`int`):
The image height calculated from the image latents dimension
image_width (`int`):
The image width calculated from the image latents dimension
height (`int`):
The height of the image output
width (`int`):
The width of the image output
"""
@@ -292,58 +266,42 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageLayeredCoreDenoiseStep
Core denoising workflow for QwenImage-Layered img2img task.
Components:
pachifier (`QwenImageLayeredPachifier`) [subfolder=]
pachifier (`QwenImageLayeredPachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
prompt_embeds (`None`):
prompt_embeds_mask (`None`):
negative_prompt_embeds (`None`, *optional*):
negative_prompt_embeds_mask (`None`, *optional*):
image_latents (`None`, *optional*):
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
layers (`int`, *optional*, defaults to 4):
Number of layers to extract from the image
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
Outputs:
latents (`Tensor`):
Denoised latents.
"""
@@ -394,52 +352,55 @@ LAYERED_AUTO_BLOCKS = InsertableDict(
# auto_docstring
class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
"""
class QwenImageLayeredAutoBlocks
Auto Modular pipeline for layered denoising tasks using QwenImage-Layered.
Components:
image_resize_processor (`VaeImageProcessor`) [subfolder=]
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`) [subfolder=]
processor (`Qwen2VLProcessor`)
tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=]
tokenizer (`Qwen2Tokenizer`): The tokenizer to use
guider (`ClassifierFreeGuidance`) [subfolder=]
guider (`ClassifierFreeGuidance`)
image_processor (`VaeImageProcessor`) [subfolder=]
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) [subfolder=]
vae (`AutoencoderKLQwenImage`)
pachifier (`QwenImageLayeredPachifier`) [subfolder=]
pachifier (`QwenImageLayeredPachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
scheduler (`FlowMatchEulerDiscreteScheduler`)
transformer (`QwenImageTransformer2DModel`) [subfolder=]
transformer (`QwenImageTransformer2DModel`)
Configs:
image_caption_prompt_en (default: <|im_start|>system
You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
Please write an image caption based on the input image:
You are a helpful assistant.<|im_end|>
<|im_start|>user
# Image Annotator
You are a professional image annotator. Please write an image caption based on the input image:
1. Write the caption using natural, descriptive language without structured formats or rich text.
2. Enrich caption details by including:
- Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
attachment relations, action relations, comparative relations, causal relations, and so on
- Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on
- Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
caption with quotation marks
- Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks
3. Maintain authenticity and accuracy:
- Avoid generalizations
- Describe all visible information in the image, while do not add information not explicitly shown in the image
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
)
image_caption_prompt_cn (default: <|im_start|>system
You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注:
You are a helpful assistant.<|im_end|>
<|im_start|>user
# 图像标注器
你是一个专业的图像标注器。请基于输入图像,撰写图注:
1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。
2. 通过加入以下内容,丰富图注细节:
- 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等
@@ -449,65 +410,54 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
3. 保持真实性与准确性:
- 不要使用笼统的描述
- 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容
<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
<|vision_start|><|image_pad|><|vision_end|><|im_end|>
<|im_start|>assistant
)
prompt_template_encode (default: <|im_start|>system
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)
prompt_template_encode_start_idx (default: 34)
tokenizer_max_length (default: 1024)
Inputs:
image (`Image`):
Input image for img2img, editing, or conditioning.
resolution (`int`, *optional*, defaults to 640):
The target area to resize the image to; can be 1024 or 640
prompt (`str`, *optional*):
The prompt to encode
use_en_prompt (`bool`, *optional*, defaults to False):
Whether to use English prompt template
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
max_sequence_length (`int`, *optional*, defaults to 1024):
Maximum sequence length for prompt encoding.
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
layers (`int`, *optional*, defaults to 4):
Number of layers to extract from the image
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps.
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
attention_kwargs (`Dict`, *optional*):
Additional kwargs for attention processors.
**denoiser_input_fields (`Tensor`, *optional*):
conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
Outputs:
images (`List`):
Generated images.
"""


@@ -169,6 +169,17 @@ def find_auto_docstring_classes(filepath: str) -> list:
     return classes_to_update

+def strip_class_name_line(doc: str, class_name: str) -> str:
+    """Remove the 'class ClassName' line from the doc if present."""
+    lines = doc.strip().split("\n")
+    if lines and lines[0].strip() == f"class {class_name}":
+        # Remove the class line and any blank line following it
+        lines = lines[1:]
+        while lines and not lines[0].strip():
+            lines = lines[1:]
+    return "\n".join(lines)
+
 def format_docstring(doc: str, indent: str = "    ") -> str:
     """Format a doc string as a properly indented docstring."""
     lines = doc.strip().split("\n")

@@ -216,6 +227,9 @@ def process_file(filepath: str, overwrite: bool = False) -> list:
             print(f"Warning: Could not get doc for {class_name} in {filepath}")
             continue
+        # Remove the "class ClassName" line since it's redundant in a docstring
+        doc = strip_class_name_line(doc, class_name)
+
         # Format the new docstring with 4-space indent
         new_docstring = format_docstring(doc, "    ")

@@ -283,4 +297,4 @@ if __name__ == "__main__":
     args = parser.parse_args()
-    check_auto_docstrings(args.path, args.fix_and_overwrite)
\ No newline at end of file
+    check_auto_docstrings(args.path, args.fix_and_overwrite)
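The new helper's behavior is easy to check directly from its definition above:

doc = "class QwenImageEditDecodeStep\n\nDecode step that decodes the latents to images."
print(strip_class_name_line(doc, "QwenImageEditDecodeStep"))
# -> "Decode step that decodes the latents to images."

print(strip_class_name_line("Unrelated doc.", "QwenImageEditDecodeStep"))
# -> "Unrelated doc."  (left untouched when there is no class-name header)

To regenerate the docstrings after such a change, the check script can be re-run in fix mode; the script path and flag spelling below are assumptions based on the argparse snippet and the usual diffusers convention:

python utils/check_auto_docstrings.py --fix_and_overwrite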