Mirror of https://github.com/huggingface/diffusers.git (synced 2026-01-27 17:22:53 +03:00)

Commit: address feedback
@@ -711,7 +711,7 @@ def format_params(params, header="Args", indent_level=4, max_line_length=115):

         formatted_params.append(param_str)

-    return "\n\n".join(formatted_params)
+    return "\n".join(formatted_params)


 def format_input_params(input_params, indent_level=4, max_line_length=115):
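For intuition, a runnable sketch of what this separator change does (the entry strings below are made up for illustration):

```python
# Hypothetical entries shaped like what format_params accumulates: each one is
# already an indented "name (`type`): description" block.
formatted_params = [
    "    height (`int`, *optional*):\n        The height in pixels of the generated image.",
    "    width (`int`, *optional*):\n        The width in pixels of the generated image.",
]

before = "\n\n".join(formatted_params)  # old behavior: blank line between entries
after = "\n".join(formatted_params)     # new behavior: single newline between entries

print(before)
print("=" * 40)
print(after)
```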
@@ -781,7 +781,7 @@ def format_components(components, indent_level=4, max_line_length=115, add_empty

         loading_field_values = []
         for field_name in component.loading_fields():
             field_value = getattr(component, field_name)
-            if field_value is not None:
+            if field_value:
                 loading_field_values.append(f"{field_name}={field_value}")

         # Add loading field information if available
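The truthiness test is what removes the empty `[subfolder=]` suffixes visible throughout the regenerated docstrings below; `is not None` let empty strings through. A sketch with a hypothetical component object:

```python
class FakeComponent:
    """Stand-in for a component spec; not the real diffusers class."""
    repo = "Qwen/Qwen-Image-Edit"
    subfolder = ""  # present but empty

    def loading_fields(self):
        return ["repo", "subfolder"]

component = FakeComponent()
loading_field_values = []
for field_name in component.loading_fields():
    field_value = getattr(component, field_name)
    if field_value:  # skips "" as well as None, so "[subfolder=]" no longer appears
        loading_field_values.append(f"{field_name}={field_value}")

print(loading_field_values)  # ['repo=Qwen/Qwen-Image-Edit']
```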
(File diff suppressed because it is too large.)
@@ -59,55 +59,46 @@ logger = logging.get_logger(__name__)
 # auto_docstring
 class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditVLEncoderStep
-
     QwenImage-Edit VL encoder step that encodes the image and text prompts together.

     Components:

-        image_resize_processor (`VaeImageProcessor`) [subfolder=]
+        image_resize_processor (`VaeImageProcessor`)

-        text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
+        text_encoder (`Qwen2_5_VLForConditionalGeneration`)

-        processor (`Qwen2VLProcessor`) [subfolder=]
+        processor (`Qwen2VLProcessor`)

-        guider (`ClassifierFreeGuidance`) [subfolder=]
+        guider (`ClassifierFreeGuidance`)

     Configs:

         prompt_template_encode (default: <|im_start|>system
-        Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
-        the user's text instruction should alter or modify the image. Generate a new image that meets the user's
-        requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
-        <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
+        Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
+        <|im_start|>user
+        <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>
+        <|im_start|>assistant
+        )

         prompt_template_encode_start_idx (default: 64)

     Inputs:

         image (`Image`):
             Input image for img2img, editing, or conditioning.

         prompt (`str`):
             The prompt or prompts to guide image generation.

         negative_prompt (`str`, *optional*):
             The prompt or prompts not to guide the image generation.

     Outputs:

         resized_image (`List`):
             The resized images.

         prompt_embeds (`Tensor`):
             The prompt embeddings.

         prompt_embeds_mask (`Tensor`):
             The encoder attention mask.

         negative_prompt_embeds (`Tensor`):
             The negative prompt embeddings.

         negative_prompt_embeds_mask (`Tensor`):
             The negative prompt embeddings mask.
     """
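For intuition on how a template like this is consumed, here is a hedged sketch: the `{}` placeholder receives the user prompt, and `prompt_template_encode_start_idx` marks how many leading tokens of encoder output belong to the fixed template prefix and get dropped. The variable names are illustrative, not the block's actual code:

```python
template = (
    "<|im_start|>system\n"
    "Describe the key features of the input image (color, shape, size, texture, objects, "
    "background), then explain how the user's text instruction should alter or modify the "
    "image. Generate a new image that meets the user's requirements while maintaining "
    "consistency with the original input where appropriate.<|im_end|>\n"
    "<|im_start|>user\n"
    "<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n"
    "<|im_start|>assistant\n"
)
start_idx = 64  # tokens that belong to the fixed template prefix

text = template.format("Replace the red car with a blue bicycle")
# After tokenizing `text` and running the text encoder, the first `start_idx`
# hidden states (the template prefix) would be discarded, keeping only the
# prompt-conditioned part:
# prompt_embeds = hidden_states[:, start_idx:]
```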
@@ -133,33 +124,26 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditVaeEncoderStep
-
     VAE encoder step that encodes the image inputs into their latent representations.

     Components:

-        image_resize_processor (`VaeImageProcessor`) [subfolder=]
+        image_resize_processor (`VaeImageProcessor`)

-        image_processor (`VaeImageProcessor`) [subfolder=]
+        image_processor (`VaeImageProcessor`)

-        vae (`AutoencoderKLQwenImage`) [subfolder=]
+        vae (`AutoencoderKLQwenImage`)

     Inputs:

         image (`Image`):
             Input image for img2img, editing, or conditioning.

         generator (`Generator`, *optional*):
             Torch generator for deterministic generation.

     Outputs:

         resized_image (`List`):
             The resized images.

         processed_image (`None`):

         image_latents (`Tensor`):
             The latents representing the reference image(s). Single tensor or list depending on input.
     """
@@ -181,47 +165,36 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditInpaintVaeEncoderStep
-
     This step processes the image and mask inputs for QwenImage-Edit inpaint tasks. It:
      - resizes the image to the target area (1024 * 1024) while maintaining the aspect ratio.
      - processes the resized image and mask image.
      - creates the image latents.

     Components:

-        image_resize_processor (`VaeImageProcessor`) [subfolder=]
+        image_resize_processor (`VaeImageProcessor`)

-        image_mask_processor (`InpaintProcessor`) [subfolder=]
+        image_mask_processor (`InpaintProcessor`)

-        vae (`AutoencoderKLQwenImage`) [subfolder=]
+        vae (`AutoencoderKLQwenImage`)

     Inputs:

         image (`Image`):
             Input image for img2img, editing, or conditioning.

         mask_image (`Image`):
             Mask image for inpainting.

         padding_mask_crop (`int`, *optional*):
             Padding for mask cropping in inpainting.

         generator (`Generator`, *optional*):
             Torch generator for deterministic generation.

     Outputs:

         resized_image (`List`):
             The resized images.

         processed_image (`None`):

         processed_mask_image (`None`):

         mask_overlay_kwargs (`Dict`):
             The kwargs for the postprocess step to apply the mask overlay.

         image_latents (`Tensor`):
             The latents representing the reference image(s). Single tensor or list depending on input.
     """
@@ -270,48 +243,34 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
 # auto_docstring
 class QwenImageEditInputStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditInputStep
-
     Input step that prepares the inputs for the edit denoising step. It:
      - makes sure the text embeddings and the additional inputs have a consistent batch size.
      - updates height/width based on `image_latents`, and patchifies `image_latents`.

     Components:

-        pachifier (`QwenImagePachifier`) [subfolder=]
+        pachifier (`QwenImagePachifier`)

     Inputs:

         num_images_per_prompt (`int`, *optional*, defaults to 1):
             The number of images to generate per prompt.

         prompt_embeds (`None`):

         prompt_embeds_mask (`None`):

         negative_prompt_embeds (`None`, *optional*):

         negative_prompt_embeds_mask (`None`, *optional*):

         height (`int`, *optional*):
             The height in pixels of the generated image.

         width (`int`, *optional*):
             The width in pixels of the generated image.

         image_latents (`None`, *optional*):

     Outputs:

         batch_size (`int`):
             Number of prompts; the final batch size of model inputs should be batch_size * num_images_per_prompt.

         dtype (`dtype`):
             Data type of model tensor inputs (determined by `prompt_embeds`).

         image_height (`int`):
             The image height calculated from the image latents dimension.

         image_width (`int`):
             The image width calculated from the image latents dimension.
     """
@@ -335,50 +294,35 @@ class QwenImageEditInputStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditInpaintInputStep
-
     Input step that prepares the inputs for the edit inpaint denoising step. It:
      - makes sure the text embeddings and the additional inputs have a consistent batch size.
      - updates height/width based on `image_latents`, and patchifies `image_latents`.

     Components:

-        pachifier (`QwenImagePachifier`) [subfolder=]
+        pachifier (`QwenImagePachifier`)

     Inputs:

         num_images_per_prompt (`int`, *optional*, defaults to 1):
             The number of images to generate per prompt.

         prompt_embeds (`None`):

         prompt_embeds_mask (`None`):

         negative_prompt_embeds (`None`, *optional*):

         negative_prompt_embeds_mask (`None`, *optional*):

         height (`int`, *optional*):
             The height in pixels of the generated image.

         width (`int`, *optional*):
             The width in pixels of the generated image.

         image_latents (`None`, *optional*):

         processed_mask_image (`None`, *optional*):

     Outputs:

         batch_size (`int`):
             Number of prompts; the final batch size of model inputs should be batch_size * num_images_per_prompt.

         dtype (`dtype`):
             Data type of model tensor inputs (determined by `prompt_embeds`).

         image_height (`int`):
             The image height calculated from the image latents dimension.

         image_width (`int`):
             The image width calculated from the image latents dimension.
     """
@@ -405,44 +349,32 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditInpaintPrepareLatentsStep
-
     This step prepares the latents/image_latents and mask inputs for the edit inpainting denoising step. It:
      - Adds noise to the image latents to create the latents input for the denoiser.
      - Creates the patchified latents `mask` based on the processed mask image.

     Components:

-        scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+        scheduler (`FlowMatchEulerDiscreteScheduler`)

-        pachifier (`QwenImagePachifier`) [subfolder=]
+        pachifier (`QwenImagePachifier`)

     Inputs:

         latents (`Tensor`):
             The initial random noise; can be generated in the prepare latents step.

         image_latents (`Tensor`):
-            The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
-            step.
+            The image latents to use for the denoising process. Can be generated in the VAE encoder and packed in the input step.

         timesteps (`Tensor`):
             The timesteps to use for the denoising process. Can be generated in the set_timesteps step.

         processed_mask_image (`Tensor`):
             The processed mask to use for the inpainting process.

         height (`None`):

         width (`None`):

         dtype (`None`):

     Outputs:

         initial_noise (`Tensor`):
             The initial random noise used for the inpainting denoising.

         mask (`Tensor`):
             The mask to use for the inpainting process.
     """
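A rough sketch of the "add noise" part, assuming the standard `FlowMatchEulerDiscreteScheduler.scale_noise` API from diffusers (shapes are illustrative; the real block also handles packing and the mask):

```python
import torch
from diffusers import FlowMatchEulerDiscreteScheduler

scheduler = FlowMatchEulerDiscreteScheduler()
scheduler.set_timesteps(num_inference_steps=30)

image_latents = torch.randn(1, 1024, 64)  # illustrative packed latents
noise = torch.randn_like(image_latents)

# Noise the clean image latents to the first timestep of the schedule;
# this is the usual starting point for img2img/inpaint denoising.
t0 = scheduler.timesteps[:1]
latents = scheduler.scale_noise(image_latents, t0, noise)
```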
@@ -464,61 +396,44 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditCoreDenoiseStep
-
     Core denoising workflow for the QwenImage-Edit edit (img2img) task.

     Components:

-        pachifier (`QwenImagePachifier`) [subfolder=]
+        pachifier (`QwenImagePachifier`)

-        scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+        scheduler (`FlowMatchEulerDiscreteScheduler`)

-        guider (`ClassifierFreeGuidance`) [subfolder=]
+        guider (`ClassifierFreeGuidance`)

-        transformer (`QwenImageTransformer2DModel`) [subfolder=]
+        transformer (`QwenImageTransformer2DModel`)

     Inputs:

         num_images_per_prompt (`int`, *optional*, defaults to 1):
             The number of images to generate per prompt.

         prompt_embeds (`None`):

         prompt_embeds_mask (`None`):

         negative_prompt_embeds (`None`, *optional*):

         negative_prompt_embeds_mask (`None`, *optional*):

         height (`int`, *optional*):
             The height in pixels of the generated image.

         width (`int`, *optional*):
             The width in pixels of the generated image.

         image_latents (`None`, *optional*):

         latents (`Tensor`, *optional*):
             Pre-generated noisy latents for image generation.

         generator (`Generator`, *optional*):
             Torch generator for deterministic generation.

         num_inference_steps (`int`, *optional*, defaults to 50):
             The number of denoising steps.

         sigmas (`List`, *optional*):
             Custom sigmas for the denoising process.

         attention_kwargs (`Dict`, *optional*):
             Additional kwargs for attention processors.

         **denoiser_input_fields (`Tensor`, *optional*):
             Conditional model inputs for the denoiser, e.g. prompt_embeds, negative_prompt_embeds, etc.

     Outputs:

         latents (`Tensor`):
             Denoised latents.
     """
@@ -556,66 +471,47 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditInpaintCoreDenoiseStep
-
     Core denoising workflow for the QwenImage-Edit edit inpaint task.

     Components:

-        pachifier (`QwenImagePachifier`) [subfolder=]
+        pachifier (`QwenImagePachifier`)

-        scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+        scheduler (`FlowMatchEulerDiscreteScheduler`)

-        guider (`ClassifierFreeGuidance`) [subfolder=]
+        guider (`ClassifierFreeGuidance`)

-        transformer (`QwenImageTransformer2DModel`) [subfolder=]
+        transformer (`QwenImageTransformer2DModel`)

     Inputs:

         num_images_per_prompt (`int`, *optional*, defaults to 1):
             The number of images to generate per prompt.

         prompt_embeds (`None`):

         prompt_embeds_mask (`None`):

         negative_prompt_embeds (`None`, *optional*):

         negative_prompt_embeds_mask (`None`, *optional*):

         height (`int`, *optional*):
             The height in pixels of the generated image.

         width (`int`, *optional*):
             The width in pixels of the generated image.

         image_latents (`None`, *optional*):

         processed_mask_image (`None`, *optional*):

         latents (`Tensor`, *optional*):
             Pre-generated noisy latents for image generation.

         generator (`Generator`, *optional*):
             Torch generator for deterministic generation.

         num_inference_steps (`int`, *optional*, defaults to 50):
             The number of denoising steps.

         sigmas (`List`, *optional*):
             Custom sigmas for the denoising process.

         strength (`float`, *optional*, defaults to 0.9):
             Strength for img2img/inpainting.

         attention_kwargs (`Dict`, *optional*):
             Additional kwargs for attention processors.

         **denoiser_input_fields (`Tensor`, *optional*):
             Conditional model inputs for the denoiser, e.g. prompt_embeds, negative_prompt_embeds, etc.

     Outputs:

         latents (`Tensor`):
             Denoised latents.
     """
@@ -694,26 +590,21 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks):
 # auto_docstring
 class QwenImageEditDecodeStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditDecodeStep
-
     Decode step that decodes the latents to images and postprocesses the generated image.

     Components:

-        vae (`AutoencoderKLQwenImage`) [subfolder=]
+        vae (`AutoencoderKLQwenImage`)

-        image_processor (`VaeImageProcessor`) [subfolder=]
+        image_processor (`VaeImageProcessor`)

     Inputs:

         latents (`Tensor`):
             The latents to decode; can be generated in the denoise step.

         output_type (`str`, *optional*, defaults to pil):
             Output format: 'pil', 'np', 'pt'.

     Outputs:

         images (`List`):
             Generated images.
     """
@@ -731,29 +622,22 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditInpaintDecodeStep
-
-    Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask
-    overlay to the original image.
+    Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the mask overlay to the original image.

     Components:

-        vae (`AutoencoderKLQwenImage`) [subfolder=]
+        vae (`AutoencoderKLQwenImage`)

-        image_mask_processor (`InpaintProcessor`) [subfolder=]
+        image_mask_processor (`InpaintProcessor`)

     Inputs:

         latents (`Tensor`):
             The latents to decode; can be generated in the denoise step.

         output_type (`str`, *optional*, defaults to pil):
             Output format: 'pil', 'np', 'pt'.

         mask_overlay_kwargs (`None`, *optional*):

     Outputs:

         images (`List`):
             Generated images.
     """
@@ -806,103 +690,81 @@ EDIT_AUTO_BLOCKS = InsertableDict(
 # auto_docstring
 class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
     """
-    class QwenImageEditAutoBlocks
-
     Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.
      - for edit (img2img) generation, you need to provide `image`
-    - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide
-    `padding_mask_crop`
+    - for edit inpainting, you need to provide `mask_image` and `image`; optionally you can provide `padding_mask_crop`

     Components:

-        image_resize_processor (`VaeImageProcessor`) [subfolder=]
+        image_resize_processor (`VaeImageProcessor`)

-        text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
+        text_encoder (`Qwen2_5_VLForConditionalGeneration`)

-        processor (`Qwen2VLProcessor`) [subfolder=]
+        processor (`Qwen2VLProcessor`)

-        guider (`ClassifierFreeGuidance`) [subfolder=]
+        guider (`ClassifierFreeGuidance`)

-        image_mask_processor (`InpaintProcessor`) [subfolder=]
+        image_mask_processor (`InpaintProcessor`)

-        vae (`AutoencoderKLQwenImage`) [subfolder=]
+        vae (`AutoencoderKLQwenImage`)

-        image_processor (`VaeImageProcessor`) [subfolder=]
+        image_processor (`VaeImageProcessor`)

-        pachifier (`QwenImagePachifier`) [subfolder=]
+        pachifier (`QwenImagePachifier`)

-        scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+        scheduler (`FlowMatchEulerDiscreteScheduler`)

-        transformer (`QwenImageTransformer2DModel`) [subfolder=]
+        transformer (`QwenImageTransformer2DModel`)

     Configs:

         prompt_template_encode (default: <|im_start|>system
-        Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
-        the user's text instruction should alter or modify the image. Generate a new image that meets the user's
-        requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
-        <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
+        Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
+        <|im_start|>user
+        <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>
+        <|im_start|>assistant
+        )

         prompt_template_encode_start_idx (default: 64)

     Inputs:

         image (`Image`):
             Input image for img2img, editing, or conditioning.

         prompt (`str`):
             The prompt or prompts to guide image generation.

         negative_prompt (`str`, *optional*):
             The prompt or prompts not to guide the image generation.

         mask_image (`Image`, *optional*):
             Mask image for inpainting.

         padding_mask_crop (`int`, *optional*):
             Padding for mask cropping in inpainting.

         generator (`Generator`, *optional*):
             Torch generator for deterministic generation.

         num_images_per_prompt (`int`, *optional*, defaults to 1):
             The number of images to generate per prompt.

         height (`int`):
             The height in pixels of the generated image.

         width (`int`):
             The width in pixels of the generated image.

         image_latents (`None`):

         processed_mask_image (`None`, *optional*):

         latents (`Tensor`):
             Pre-generated noisy latents for image generation.

         num_inference_steps (`int`):
             The number of denoising steps.

         sigmas (`List`, *optional*):
             Custom sigmas for the denoising process.

         strength (`float`, *optional*, defaults to 0.9):
             Strength for img2img/inpainting.

         attention_kwargs (`Dict`, *optional*):
             Additional kwargs for attention processors.

         **denoiser_input_fields (`Tensor`, *optional*):
             Conditional model inputs for the denoiser, e.g. prompt_embeds, negative_prompt_embeds, etc.

         output_type (`str`, *optional*, defaults to pil):
             Output format: 'pil', 'np', 'pt'.

         mask_overlay_kwargs (`None`, *optional*):

     Outputs:

         images (`List`):
             Generated images.
     """
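A hedged usage sketch for these auto blocks. The modular-pipelines API is still evolving, so the import path, `init_pipeline`, `load_components`, and the `output="images"` convention below are assumptions rather than anything verbatim from this diff:

```python
import torch
from PIL import Image
from diffusers.modular_pipelines import QwenImageEditAutoBlocks  # assumed path

blocks = QwenImageEditAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit")  # assumed components repo
pipe.load_components(torch_dtype=torch.bfloat16)
pipe.to("cuda")

image = Image.new("RGB", (1024, 1024), "gray")  # placeholder input image
images = pipe(
    image=image,
    prompt="turn the sky into a golden sunset",
    num_inference_steps=30,
    output="images",
)
```

Passing `mask_image` (and optionally `padding_mask_crop`) alongside `image` would route through the inpaint branch instead, per the docstring above.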
@@ -52,57 +52,48 @@ logger = logging.get_logger(__name__)
 # auto_docstring
 class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditPlusVLEncoderStep
-
     QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together.

     Components:

-        image_resize_processor (`VaeImageProcessor`) [subfolder=]
+        image_resize_processor (`VaeImageProcessor`)

-        text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
+        text_encoder (`Qwen2_5_VLForConditionalGeneration`)

-        processor (`Qwen2VLProcessor`) [subfolder=]
+        processor (`Qwen2VLProcessor`)

-        guider (`ClassifierFreeGuidance`) [subfolder=]
+        guider (`ClassifierFreeGuidance`)

     Configs:

         prompt_template_encode (default: <|im_start|>system
-        Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
-        the user's text instruction should alter or modify the image. Generate a new image that meets the user's
-        requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
-        {}<|im_end|> <|im_start|>assistant )
+        Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
+        <|im_start|>user
+        {}<|im_end|>
+        <|im_start|>assistant
+        )

         img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)

         prompt_template_encode_start_idx (default: 64)

     Inputs:

         image (`Image`):
             Input image for img2img, editing, or conditioning.

         prompt (`str`):
             The prompt or prompts to guide image generation.

         negative_prompt (`str`, *optional*):
             The prompt or prompts not to guide the image generation.

     Outputs:

         resized_cond_image (`List`):
             The resized images.

         prompt_embeds (`Tensor`):
             The prompt embeddings.

         prompt_embeds_mask (`Tensor`):
             The encoder attention mask.

         negative_prompt_embeds (`Tensor`):
             The negative prompt embeddings.

         negative_prompt_embeds_mask (`Tensor`):
             The negative prompt embeddings mask.
     """
@@ -127,34 +118,27 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditPlusVaeEncoderStep
-
-    VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based
-    on its own aspect ratio to 1024x1024 target area.
+    VAE encoder step that encodes image inputs into latent representations.
+    Each image is resized independently based on its own aspect ratio to a 1024x1024 target area.

     Components:

-        image_resize_processor (`VaeImageProcessor`) [subfolder=]
+        image_resize_processor (`VaeImageProcessor`)

-        image_processor (`VaeImageProcessor`) [subfolder=]
+        image_processor (`VaeImageProcessor`)

-        vae (`AutoencoderKLQwenImage`) [subfolder=]
+        vae (`AutoencoderKLQwenImage`)

     Inputs:

         image (`Image`):
             Input image for img2img, editing, or conditioning.

         generator (`Generator`, *optional*):
             Torch generator for deterministic generation.

     Outputs:

         resized_image (`List`):
             The resized images.

         processed_image (`None`):

         image_latents (`Tensor`):
             The latents representing the reference image(s). Single tensor or list depending on input.
     """
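The "resized independently ... to a 1024x1024 target area" rule amounts to scaling each image so that width x height ≈ 1024², preserving aspect ratio. A sketch; the real processor's rounding and multiple-of constraints may differ:

```python
import math

def resize_to_area(width: int, height: int, target_area: int = 1024 * 1024, multiple: int = 32):
    """Scale (width, height) to roughly target_area while keeping the aspect ratio."""
    scale = math.sqrt(target_area / (width * height))
    return (
        round(width * scale / multiple) * multiple,
        round(height * scale / multiple) * multiple,
    )

print(resize_to_area(1920, 1080))  # (1376, 768): landscape stays landscape
print(resize_to_area(512, 512))   # (1024, 1024): a small square is upscaled
```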
@@ -184,9 +168,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditPlusInputStep
-
     Input step that prepares the inputs for the Edit Plus denoising step. It:
      - Standardizes the text embeddings batch size.
      - Processes the list of image latents: patchifies, concatenates along dim=1, and expands the batch.
      - Outputs lists of image_height/image_width for RoPE calculation.
@@ -194,40 +176,28 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):

     Components:

-        pachifier (`QwenImagePachifier`) [subfolder=]
+        pachifier (`QwenImagePachifier`)

     Inputs:

         num_images_per_prompt (`int`, *optional*, defaults to 1):
             The number of images to generate per prompt.

         prompt_embeds (`None`):

         prompt_embeds_mask (`None`):

         negative_prompt_embeds (`None`, *optional*):

         negative_prompt_embeds_mask (`None`, *optional*):

         height (`int`, *optional*):
             The height in pixels of the generated image.

         width (`int`, *optional*):
             The width in pixels of the generated image.

         image_latents (`None`, *optional*):

     Outputs:

         batch_size (`int`):
             Number of prompts; the final batch size of model inputs should be batch_size * num_images_per_prompt.

         dtype (`dtype`):
             Data type of model tensor inputs (determined by `prompt_embeds`).

         image_height (`List`):
             The image heights calculated from the image latents dimension.

         image_width (`List`):
             The image widths calculated from the image latents dimension.
     """
@@ -254,61 +224,44 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditPlusCoreDenoiseStep
-
     Core denoising workflow for the QwenImage-Edit Plus edit (img2img) task.

     Components:

-        pachifier (`QwenImagePachifier`) [subfolder=]
+        pachifier (`QwenImagePachifier`)

-        scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+        scheduler (`FlowMatchEulerDiscreteScheduler`)

-        guider (`ClassifierFreeGuidance`) [subfolder=]
+        guider (`ClassifierFreeGuidance`)

-        transformer (`QwenImageTransformer2DModel`) [subfolder=]
+        transformer (`QwenImageTransformer2DModel`)

     Inputs:

         num_images_per_prompt (`int`, *optional*, defaults to 1):
             The number of images to generate per prompt.

         prompt_embeds (`None`):

         prompt_embeds_mask (`None`):

         negative_prompt_embeds (`None`, *optional*):

         negative_prompt_embeds_mask (`None`, *optional*):

         height (`int`, *optional*):
             The height in pixels of the generated image.

         width (`int`, *optional*):
             The width in pixels of the generated image.

         image_latents (`None`, *optional*):

         latents (`Tensor`, *optional*):
             Pre-generated noisy latents for image generation.

         generator (`Generator`, *optional*):
             Torch generator for deterministic generation.

         num_inference_steps (`int`, *optional*, defaults to 50):
             The number of denoising steps.

         sigmas (`List`, *optional*):
             Custom sigmas for the denoising process.

         attention_kwargs (`Dict`, *optional*):
             Additional kwargs for attention processors.

         **denoiser_input_fields (`Tensor`, *optional*):
             Conditional model inputs for the denoiser, e.g. prompt_embeds, negative_prompt_embeds, etc.

     Outputs:

         latents (`Tensor`):
             Denoised latents.
     """
@@ -350,26 +303,21 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
     """
-    class QwenImageEditPlusDecodeStep
-
     Decode step that decodes the latents to images and postprocesses the generated image.

     Components:

-        vae (`AutoencoderKLQwenImage`) [subfolder=]
+        vae (`AutoencoderKLQwenImage`)

-        image_processor (`VaeImageProcessor`) [subfolder=]
+        image_processor (`VaeImageProcessor`)

     Inputs:

         latents (`Tensor`):
             The latents to decode; can be generated in the denoise step.

         output_type (`str`, *optional*, defaults to pil):
             Output format: 'pil', 'np', 'pt'.

     Outputs:

         images (`List`):
             Generated images.
     """
@@ -400,88 +348,73 @@ EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
 # auto_docstring
 class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
     """
-    class QwenImageEditPlusAutoBlocks
-
     Auto Modular pipeline for edit (img2img) tasks using QwenImage-Edit Plus.
      - `image` is a required input (can be a single image or a list of images).
      - Each image is resized independently based on its own aspect ratio.
      - The VL encoder uses a 384x384 target area; the VAE encoder uses a 1024x1024 target area.

     Components:

-        image_resize_processor (`VaeImageProcessor`) [subfolder=]
+        image_resize_processor (`VaeImageProcessor`)

-        text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
+        text_encoder (`Qwen2_5_VLForConditionalGeneration`)

-        processor (`Qwen2VLProcessor`) [subfolder=]
+        processor (`Qwen2VLProcessor`)

-        guider (`ClassifierFreeGuidance`) [subfolder=]
+        guider (`ClassifierFreeGuidance`)

-        image_processor (`VaeImageProcessor`) [subfolder=]
+        image_processor (`VaeImageProcessor`)

-        vae (`AutoencoderKLQwenImage`) [subfolder=]
+        vae (`AutoencoderKLQwenImage`)

-        pachifier (`QwenImagePachifier`) [subfolder=]
+        pachifier (`QwenImagePachifier`)

-        scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+        scheduler (`FlowMatchEulerDiscreteScheduler`)

-        transformer (`QwenImageTransformer2DModel`) [subfolder=]
+        transformer (`QwenImageTransformer2DModel`)

     Configs:

         prompt_template_encode (default: <|im_start|>system
-        Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
-        the user's text instruction should alter or modify the image. Generate a new image that meets the user's
-        requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
-        {}<|im_end|> <|im_start|>assistant )
+        Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
+        <|im_start|>user
+        {}<|im_end|>
+        <|im_start|>assistant
+        )

         img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)

         prompt_template_encode_start_idx (default: 64)

     Inputs:

         image (`Image`):
             Input image for img2img, editing, or conditioning.

         prompt (`str`):
             The prompt or prompts to guide image generation.

         negative_prompt (`str`, *optional*):
             The prompt or prompts not to guide the image generation.

         generator (`Generator`, *optional*):
             Torch generator for deterministic generation.

         num_images_per_prompt (`int`, *optional*, defaults to 1):
             The number of images to generate per prompt.

         height (`int`, *optional*):
             The height in pixels of the generated image.

         width (`int`, *optional*):
             The width in pixels of the generated image.

         latents (`Tensor`, *optional*):
             Pre-generated noisy latents for image generation.

         num_inference_steps (`int`, *optional*, defaults to 50):
             The number of denoising steps.

         sigmas (`List`, *optional*):
             Custom sigmas for the denoising process.

         attention_kwargs (`Dict`, *optional*):
             Additional kwargs for attention processors.

         **denoiser_input_fields (`Tensor`, *optional*):
             Conditional model inputs for the denoiser, e.g. prompt_embeds, negative_prompt_embeds, etc.

         output_type (`str`, *optional*, defaults to pil):
             Output format: 'pil', 'np', 'pt'.

     Outputs:

         images (`List`):
             Generated images.
     """
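Usage mirrors the `QwenImageEditAutoBlocks` sketch earlier; the Plus-specific point is that `image` may be a list, each entry resized on its own aspect ratio, and the `img_template_encode` config numbers the references as "Picture 1", "Picture 2", etc. An assumed call shape, not verbatim API:

```python
from PIL import Image

images = [
    Image.new("RGB", (1024, 768), "gray"),   # each entry keeps its own aspect ratio
    Image.new("RGB", (768, 1024), "white"),
]
# Assumed call shape: several reference images, one instruction.
# out = pipe(image=images,
#            prompt="put the object from Picture 1 into the scene from Picture 2",
#            output="images")
```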
@@ -53,43 +53,45 @@ logger = logging.get_logger(__name__)
 # auto_docstring
 class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
     """
-    class QwenImageLayeredTextEncoderStep
-
-    QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not
-    provided.
+    QwenImage-Layered text encoder step that encodes the text prompt; it will generate a prompt based on the image if none is provided.

     Components:

-        image_resize_processor (`VaeImageProcessor`) [subfolder=]
+        image_resize_processor (`VaeImageProcessor`)

-        text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
+        text_encoder (`Qwen2_5_VLForConditionalGeneration`)

-        processor (`Qwen2VLProcessor`) [subfolder=]
+        processor (`Qwen2VLProcessor`)

-        tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=]
+        tokenizer (`Qwen2Tokenizer`): The tokenizer to use

-        guider (`ClassifierFreeGuidance`) [subfolder=]
+        guider (`ClassifierFreeGuidance`)

     Configs:

         image_caption_prompt_en (default: <|im_start|>system
-        You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
-        Please write an image caption based on the input image:
+        You are a helpful assistant.<|im_end|>
+        <|im_start|>user
+        # Image Annotator
+        You are a professional image annotator. Please write an image caption based on the input image:
         1. Write the caption using natural, descriptive language without structured formats or rich text.
         2. Enrich caption details by including:
         - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
-        - Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
-        attachment relations, action relations, comparative relations, causal relations, and so on
+        - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on
         - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
-        - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
-        caption with quotation marks
+        - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks
         3. Maintain authenticity and accuracy:
         - Avoid generalizations
         - Describe all visible information in the image, while do not add information not explicitly shown in the image
-        <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
+        <|vision_start|><|image_pad|><|vision_end|><|im_end|>
+        <|im_start|>assistant
+        )

         image_caption_prompt_cn (default: <|im_start|>system
-        You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注:
+        You are a helpful assistant.<|im_end|>
+        <|im_start|>user
+        # 图像标注器
+        你是一个专业的图像标注器。请基于输入图像,撰写图注:
         1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。
         2. 通过加入以下内容,丰富图注细节:
         - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等
@@ -99,50 +101,44 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
         3. 保持真实性与准确性:
         - 不要使用笼统的描述
         - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容
-        <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
+        <|vision_start|><|image_pad|><|vision_end|><|im_end|>
+        <|im_start|>assistant
+        )

         prompt_template_encode (default: <|im_start|>system
-        Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
-        objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
+        Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
+        <|im_start|>user
+        {}<|im_end|>
+        <|im_start|>assistant
+        )

         prompt_template_encode_start_idx (default: 34)

         tokenizer_max_length (default: 1024)

     Inputs:

         image (`Image`):
             Input image for img2img, editing, or conditioning.

         resolution (`int`, *optional*, defaults to 640):
             The target area to resize the image to; can be 1024 or 640.

         prompt (`str`, *optional*):
             The prompt to encode.

         use_en_prompt (`bool`, *optional*, defaults to False):
             Whether to use the English prompt template.

         negative_prompt (`str`, *optional*):
             The prompt or prompts not to guide the image generation.

         max_sequence_length (`int`, *optional*, defaults to 1024):
             Maximum sequence length for prompt encoding.

     Outputs:

         resized_image (`List`):
             The resized images.

         prompt_embeds (`Tensor`):
             The prompt embeddings.

         prompt_embeds_mask (`Tensor`):
             The encoder attention mask.

         negative_prompt_embeds (`Tensor`):
             The negative prompt embeddings.

         negative_prompt_embeds_mask (`Tensor`):
             The negative prompt embeddings mask.
     """
@@ -169,36 +165,28 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
     """
-    class QwenImageLayeredVaeEncoderStep
-
     VAE encoder step that encodes the image inputs into their latent representations.

     Components:

-        image_resize_processor (`VaeImageProcessor`) [subfolder=]
+        image_resize_processor (`VaeImageProcessor`)

-        image_processor (`VaeImageProcessor`) [subfolder=]
+        image_processor (`VaeImageProcessor`)

-        vae (`AutoencoderKLQwenImage`) [subfolder=]
+        vae (`AutoencoderKLQwenImage`)

     Inputs:

         image (`Image`):
             Input image for img2img, editing, or conditioning.

         resolution (`int`, *optional*, defaults to 640):
             The target area to resize the image to; can be 1024 or 640.

         generator (`Generator`, *optional*):
             Torch generator for deterministic generation.

     Outputs:

         resized_image (`List`):
             The resized images.

         processed_image (`None`):

         image_latents (`Tensor`):
             The latents representing the reference image(s). Single tensor or list depending on input.
     """
@@ -226,48 +214,34 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageLayeredInputStep(SequentialPipelineBlocks):
     """
-    class QwenImageLayeredInputStep
-
     Input step that prepares the inputs for the layered denoising step. It:
      - makes sure the text embeddings and the additional inputs have a consistent batch size.
      - updates height/width based on `image_latents`, and patchifies `image_latents`.

     Components:

-        pachifier (`QwenImageLayeredPachifier`) [subfolder=]
+        pachifier (`QwenImageLayeredPachifier`)

     Inputs:

         num_images_per_prompt (`int`, *optional*, defaults to 1):
             The number of images to generate per prompt.

         prompt_embeds (`None`):

         prompt_embeds_mask (`None`):

         negative_prompt_embeds (`None`, *optional*):

         negative_prompt_embeds_mask (`None`, *optional*):

         image_latents (`None`, *optional*):

     Outputs:

         batch_size (`int`):
             Number of prompts; the final batch size of model inputs should be batch_size * num_images_per_prompt.

         dtype (`dtype`):
             Data type of model tensor inputs (determined by `prompt_embeds`).

         image_height (`int`):
             The image height calculated from the image latents dimension.

         image_width (`int`):
             The image width calculated from the image latents dimension.

         height (`int`):
             The height of the image output.

         width (`int`):
             The width of the image output.
     """
@@ -292,58 +266,42 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    class QwenImageLayeredCoreDenoiseStep
-
     Core denoising workflow for the QwenImage-Layered img2img task.

     Components:

-        pachifier (`QwenImageLayeredPachifier`) [subfolder=]
+        pachifier (`QwenImageLayeredPachifier`)

-        scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+        scheduler (`FlowMatchEulerDiscreteScheduler`)

-        guider (`ClassifierFreeGuidance`) [subfolder=]
+        guider (`ClassifierFreeGuidance`)

-        transformer (`QwenImageTransformer2DModel`) [subfolder=]
+        transformer (`QwenImageTransformer2DModel`)

     Inputs:

         num_images_per_prompt (`int`, *optional*, defaults to 1):
             The number of images to generate per prompt.

         prompt_embeds (`None`):

         prompt_embeds_mask (`None`):

         negative_prompt_embeds (`None`, *optional*):

         negative_prompt_embeds_mask (`None`, *optional*):

         image_latents (`None`, *optional*):

         latents (`Tensor`, *optional*):
             Pre-generated noisy latents for image generation.

         layers (`int`, *optional*, defaults to 4):
             Number of layers to extract from the image.

         generator (`Generator`, *optional*):
             Torch generator for deterministic generation.

         num_inference_steps (`int`, *optional*, defaults to 50):
             The number of denoising steps.

         sigmas (`List`, *optional*):
             Custom sigmas for the denoising process.

         attention_kwargs (`Dict`, *optional*):
             Additional kwargs for attention processors.

         **denoiser_input_fields (`Tensor`, *optional*):
             Conditional model inputs for the denoiser, e.g. prompt_embeds, negative_prompt_embeds, etc.

     Outputs:

         latents (`Tensor`):
             Denoised latents.
     """
@@ -394,52 +352,55 @@ LAYERED_AUTO_BLOCKS = InsertableDict(
 # auto_docstring
 class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
     """
-    class QwenImageLayeredAutoBlocks
-
     Auto Modular pipeline for layered denoising tasks using QwenImage-Layered.

     Components:

-        image_resize_processor (`VaeImageProcessor`) [subfolder=]
+        image_resize_processor (`VaeImageProcessor`)

-        text_encoder (`Qwen2_5_VLForConditionalGeneration`) [subfolder=]
+        text_encoder (`Qwen2_5_VLForConditionalGeneration`)

-        processor (`Qwen2VLProcessor`) [subfolder=]
+        processor (`Qwen2VLProcessor`)

-        tokenizer (`Qwen2Tokenizer`): The tokenizer to use [subfolder=]
+        tokenizer (`Qwen2Tokenizer`): The tokenizer to use

-        guider (`ClassifierFreeGuidance`) [subfolder=]
+        guider (`ClassifierFreeGuidance`)

-        image_processor (`VaeImageProcessor`) [subfolder=]
+        image_processor (`VaeImageProcessor`)

-        vae (`AutoencoderKLQwenImage`) [subfolder=]
+        vae (`AutoencoderKLQwenImage`)

-        pachifier (`QwenImageLayeredPachifier`) [subfolder=]
+        pachifier (`QwenImageLayeredPachifier`)

-        scheduler (`FlowMatchEulerDiscreteScheduler`) [subfolder=]
+        scheduler (`FlowMatchEulerDiscreteScheduler`)

-        transformer (`QwenImageTransformer2DModel`) [subfolder=]
+        transformer (`QwenImageTransformer2DModel`)

     Configs:

         image_caption_prompt_en (default: <|im_start|>system
-        You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
-        Please write an image caption based on the input image:
+        You are a helpful assistant.<|im_end|>
+        <|im_start|>user
+        # Image Annotator
+        You are a professional image annotator. Please write an image caption based on the input image:
         1. Write the caption using natural, descriptive language without structured formats or rich text.
         2. Enrich caption details by including:
         - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
-        - Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
-        attachment relations, action relations, comparative relations, causal relations, and so on
+        - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on
         - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
-        - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
-        caption with quotation marks
+        - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks
         3. Maintain authenticity and accuracy:
         - Avoid generalizations
         - Describe all visible information in the image, while do not add information not explicitly shown in the image
-        <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
+        <|vision_start|><|image_pad|><|vision_end|><|im_end|>
+        <|im_start|>assistant
+        )

         image_caption_prompt_cn (default: <|im_start|>system
-        You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注:
+        You are a helpful assistant.<|im_end|>
+        <|im_start|>user
+        # 图像标注器
+        你是一个专业的图像标注器。请基于输入图像,撰写图注:
         1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。
         2. 通过加入以下内容,丰富图注细节:
         - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等
@@ -449,65 +410,54 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
         3. 保持真实性与准确性:
         - 不要使用笼统的描述
         - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容
-        <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
+        <|vision_start|><|image_pad|><|vision_end|><|im_end|>
+        <|im_start|>assistant
+        )

         prompt_template_encode (default: <|im_start|>system
-        Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
-        objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
+        Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
+        <|im_start|>user
+        {}<|im_end|>
+        <|im_start|>assistant
+        )

         prompt_template_encode_start_idx (default: 34)

         tokenizer_max_length (default: 1024)

     Inputs:

         image (`Image`):
             Input image for img2img, editing, or conditioning.

         resolution (`int`, *optional*, defaults to 640):
             The target area to resize the image to; can be 1024 or 640.

         prompt (`str`, *optional*):
             The prompt to encode.

         use_en_prompt (`bool`, *optional*, defaults to False):
             Whether to use the English prompt template.

         negative_prompt (`str`, *optional*):
             The prompt or prompts not to guide the image generation.

         max_sequence_length (`int`, *optional*, defaults to 1024):
             Maximum sequence length for prompt encoding.

         generator (`Generator`, *optional*):
             Torch generator for deterministic generation.

         num_images_per_prompt (`int`, *optional*, defaults to 1):
             The number of images to generate per prompt.

         latents (`Tensor`, *optional*):
             Pre-generated noisy latents for image generation.

         layers (`int`, *optional*, defaults to 4):
             Number of layers to extract from the image.

         num_inference_steps (`int`, *optional*, defaults to 50):
             The number of denoising steps.

         sigmas (`List`, *optional*):
             Custom sigmas for the denoising process.

         attention_kwargs (`Dict`, *optional*):
             Additional kwargs for attention processors.

         **denoiser_input_fields (`Tensor`, *optional*):
             Conditional model inputs for the denoiser, e.g. prompt_embeds, negative_prompt_embeds, etc.

         output_type (`str`, *optional*, defaults to pil):
             Output format: 'pil', 'np', 'pt'.

     Outputs:

         images (`List`):
             Generated images.
     """
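And a hedged sketch for driving the layered blocks, wiring up the inputs documented above (`resolution`, `layers`, `use_en_prompt`); the call shape follows the earlier auto-blocks sketch and is an assumption:

```python
from PIL import Image

image = Image.new("RGB", (1024, 1024), "white")  # placeholder input
# Assumed call shape, following the documented inputs:
# out = pipe(image=image, resolution=640, layers=4, use_en_prompt=True,
#            output="images")
# layers=4 asks for four image layers decomposed from the input; if no prompt
# is given, the text-encoder step captions the image first (see Configs above).
```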
@@ -169,6 +169,17 @@ def find_auto_docstring_classes(filepath: str) -> list:
     return classes_to_update


+def strip_class_name_line(doc: str, class_name: str) -> str:
+    """Remove the 'class ClassName' line from the doc if present."""
+    lines = doc.strip().split("\n")
+    if lines and lines[0].strip() == f"class {class_name}":
+        # Remove the class line and any blank line following it
+        lines = lines[1:]
+        while lines and not lines[0].strip():
+            lines = lines[1:]
+    return "\n".join(lines)
+
+
 def format_docstring(doc: str, indent: str = "    ") -> str:
     """Format a doc string as a properly indented docstring."""
     lines = doc.strip().split("\n")
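A quick check of the new helper's behavior (assuming `strip_class_name_line` from the hunk above is in scope):

```python
doc = """class QwenImageEditVLEncoderStep

QwenImage-Edit VL encoder step that encodes the image and text prompts together.
"""

print(strip_class_name_line(doc, "QwenImageEditVLEncoderStep"))
# -> QwenImage-Edit VL encoder step that encodes the image and text prompts together.

# A non-matching name leaves the doc untouched (modulo the outer strip):
print(strip_class_name_line(doc, "SomethingElse").splitlines()[0])
# -> class QwenImageEditVLEncoderStep
```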
@@ -216,6 +227,9 @@ def process_file(filepath: str, overwrite: bool = False) -> list:
                 print(f"Warning: Could not get doc for {class_name} in {filepath}")
                 continue

+            # Remove the "class ClassName" line since it's redundant in a docstring
+            doc = strip_class_name_line(doc, class_name)
+
             # Format the new docstring with 4-space indent
             new_docstring = format_docstring(doc, "    ")
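Taken together, the per-class flow inside `process_file` is now: pull the docstring, drop the redundant class line, re-indent. A condensed sketch, assuming both helpers above are in scope:

```python
class Example:
    """class Example

    An example block docstring."""

doc = strip_class_name_line(Example.__doc__, "Example")
new_docstring = format_docstring(doc, "    ")
print(new_docstring)  # the docstring re-indented, without the "class Example" line
```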
@@ -283,4 +297,4 @@ if __name__ == "__main__":

     args = parser.parse_args()

-    check_auto_docstrings(args.path, args.fix_and_overwrite)
+    check_auto_docstrings(args.path, args.fix_and_overwrite)