Repository: https://github.com/huggingface/diffusers.git (mirror)
commit 1c90ce33f2
parent 507953f415
Author: yiyixuxu
Date: 2026-01-10 12:21:26 +01:00
4 changed files with 79 additions and 90 deletions

View File

@@ -77,11 +77,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks):
Configs:
prompt_template_encode (default: <|im_start|>system
- Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
- <|im_start|>user
- {}<|im_end|>
- <|im_start|>assistant
- )
+ Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
+ objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 34)
@@ -260,8 +257,7 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
"""
class QwenImageOptionalControlNetVaeEncoderStep
- Vae encoder step that encode the image inputs into their latent representations.
- This is an auto pipeline block.
+ Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block.
- `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.
- if `control_image` is not provided, step will be skipped.
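For reference, the hunk above documents the trigger-input behavior of an AutoPipelineBlocks: a sub-block runs only when its trigger input (here `control_image`) is provided. A minimal sketch of that dispatch pattern, as a hypothetical simplification rather than the actual diffusers implementation:

# Hypothetical simplification of trigger-input dispatch; not diffusers' code.
def select_block(inputs: dict, triggers: list, blocks: list):
    for trigger, block in zip(triggers, blocks):
        # A block is selected when its trigger input (e.g. "control_image")
        # is present in the call inputs.
        if trigger is not None and inputs.get(trigger) is not None:
            return block
    # No trigger provided and no default block: the step is skipped.
    return None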
@@ -458,7 +454,8 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
The initial random noised, can be generated in prepare latent step.
image_latents (`Tensor`):
- The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.
+ The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
+ step.
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
@@ -503,7 +500,8 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageCoreDenoiseStep
- step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.).
+ step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the
+ inputs (timesteps, latents, rope inputs etc.).
Components:
@@ -593,7 +591,8 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageInpaintCoreDenoiseStep
- Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task.
+ Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for
+ inpaint task.
Components:
@@ -692,7 +691,8 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageImg2ImgCoreDenoiseStep
- Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task.
+ Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for
+ img2img task.
Components:
@@ -789,7 +789,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageControlNetCoreDenoiseStep
- step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.).
+ step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the
+ inputs (timesteps, latents, rope inputs etc.).
Components:
@@ -897,7 +898,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageControlNetInpaintCoreDenoiseStep
- Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task.
+ Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for
+ inpaint task.
Components:
@@ -1014,7 +1016,8 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
"""
class QwenImageControlNetImg2ImgCoreDenoiseStep
- Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task.
+ Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for
+ img2img task.
Components:
@@ -1232,7 +1235,8 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
"""
class QwenImageInpaintDecodeStep
- Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image.
+ Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask
+ overally to the original image.
Components:
@@ -1294,6 +1298,7 @@ AUTO_BLOCKS = InsertableDict(
]
)
+ # auto_docstring
class QwenImageAutoBlocks(SequentialPipelineBlocks):
"""
@@ -1301,7 +1306,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.
- for image-to-image generation, you need to provide `image`
- - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`
+ - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.
- to run the controlnet workflow, you need to provide `control_image`
- for text-to-image generation, all you need to provide is `prompt`
@@ -1332,11 +1337,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
Configs:
prompt_template_encode (default: <|im_start|>system
- Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
- <|im_start|>user
- {}<|im_end|>
- <|im_start|>assistant
- )
+ Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
+ objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 34)
@@ -1428,6 +1430,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
images (`List`):
Generated images.
"""
model_name = "qwenimage"
block_classes = AUTO_BLOCKS.values()
@@ -1438,7 +1441,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
return (
"Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n"
+ "- for image-to-image generation, you need to provide `image`\n"
+ "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n"
+ "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.\n"
+ "- to run the controlnet workflow, you need to provide `control_image`\n"
+ "- for text-to-image generation, all you need to provide is `prompt`"
)
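For context on the docstrings being re-wrapped above, this is roughly how the auto blocks are driven: the workflow is selected by which inputs you pass. A minimal sketch, assuming the `diffusers.modular_pipelines` import path and the `Qwen/Qwen-Image` repo id (both unverified here; check the modular diffusers docs for your version):

# Minimal usage sketch; import path, repo id, and dtype are assumptions.
import torch
from diffusers.modular_pipelines import QwenImageAutoBlocks  # assumed path

blocks = QwenImageAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image")  # assumed modular repo id
pipe.load_default_components(torch_dtype=torch.bfloat16)
pipe.to("cuda")

# text-to-image: only `prompt` is required
image = pipe(prompt="a cat wearing a red scarf", output="images")[0]

# inpainting: providing `mask_image` and `image` routes to the inpaint workflow,
# optionally with `padding_mask_crop`; providing `control_image` routes to controlnet.
# image = pipe(prompt="...", image=init, mask_image=mask, output="images")[0]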

View File

@@ -76,11 +76,10 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
Configs:
prompt_template_encode (default: <|im_start|>system
- Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
- <|im_start|>user
- <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>
- <|im_start|>assistant
- )
+ Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
+ the user's text instruction should alter or modify the image. Generate a new image that meets the user's
+ requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
+ <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 64)
@@ -424,7 +423,8 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
The initial random noised, can be generated in prepare latent step.
image_latents (`Tensor`):
- The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.
+ The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
+ step.
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
@@ -733,7 +733,8 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
"""
class QwenImageEditInpaintDecodeStep
- Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image.
+ Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask
+ overlay to the original image.
Components:
@@ -801,6 +802,7 @@ EDIT_AUTO_BLOCKS = InsertableDict(
]
)
+ # auto_docstring
class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
"""
@@ -808,7 +810,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.
- for edit (img2img) generation, you need to provide `image`
- - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`
+ - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide
+ `padding_mask_crop`
Components:
@@ -835,11 +838,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
Configs:
prompt_template_encode (default: <|im_start|>system
- Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
- <|im_start|>user
- <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>
- <|im_start|>assistant
- )
+ Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
+ the user's text instruction should alter or modify the image. Generate a new image that meets the user's
+ requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
+ <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 64)
@@ -904,6 +906,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
images (`List`):
Generated images.
"""
model_name = "qwenimage-edit"
block_classes = EDIT_AUTO_BLOCKS.values()
block_names = EDIT_AUTO_BLOCKS.keys()
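The edit blocks documented above follow the same input-driven pattern: `image` alone selects edit (img2img), `image` plus `mask_image` selects edit inpainting. A sketch under the same assumptions as the earlier example (import path and repo id unverified):

# Sketch of the edit workflow; names marked "assumed" are not confirmed by this diff.
import torch
from diffusers.modular_pipelines import QwenImageEditAutoBlocks  # assumed path
from diffusers.utils import load_image

source_image = load_image("https://example.com/input.png")  # placeholder URL

blocks = QwenImageEditAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit")  # assumed repo id
pipe.load_default_components(torch_dtype=torch.bfloat16)
pipe.to("cuda")

# edit (img2img): `image` plus an instruction prompt
image = pipe(prompt="turn the sky into a sunset", image=source_image, output="images")[0]
# edit inpainting additionally takes `mask_image` (and optionally `padding_mask_crop`)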

View File

@@ -69,11 +69,10 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
Configs:
prompt_template_encode (default: <|im_start|>system
- Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
- <|im_start|>user
- {}<|im_end|>
- <|im_start|>assistant
- )
+ Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
+ the user's text instruction should alter or modify the image. Generate a new image that meets the user's
+ requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
+ {}<|im_end|> <|im_start|>assistant )
img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)
@@ -130,8 +129,8 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageEditPlusVaeEncoderStep
- VAE encoder step that encodes image inputs into latent representations.
- Each image is resized independently based on its own aspect ratio to 1024x1024 target area.
+ VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based
+ on its own aspect ratio to 1024x1024 target area.
Components:
@@ -397,6 +396,7 @@ EDIT_PLUS_AUTO_BLOCKS = InsertableDict(
]
)
+ # auto_docstring
class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
"""
@@ -430,11 +430,10 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
Configs:
prompt_template_encode (default: <|im_start|>system
- Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>
- <|im_start|>user
- {}<|im_end|>
- <|im_start|>assistant
- )
+ Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how
+ the user's text instruction should alter or modify the image. Generate a new image that meets the user's
+ requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user
+ {}<|im_end|> <|im_start|>assistant )
img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>)
@@ -486,6 +485,7 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
images (`List`):
Generated images.
"""
model_name = "qwenimage-edit-plus"
block_classes = EDIT_PLUS_AUTO_BLOCKS.values()
block_names = EDIT_PLUS_AUTO_BLOCKS.keys()
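The QwenImageEditPlusVaeEncoderStep docstring above says each input image is resized independently, preserving its own aspect ratio, to a 1024x1024 target area. A rough illustration of that kind of resize; the rounding granularity is an assumption, not the step's actual code:

# Illustrative aspect-preserving resize to a fixed pixel area.
import math
from PIL import Image

def resize_to_target_area(img: Image.Image, target_area: int = 1024 * 1024,
                          multiple: int = 32) -> Image.Image:
    # Scale so width * height is approximately target_area, keeping aspect ratio.
    w, h = img.size
    scale = math.sqrt(target_area / (w * h))
    # Snap each side to a multiple compatible with the VAE/patch size (assumed value).
    new_w = max(multiple, round(w * scale / multiple) * multiple)
    new_h = max(multiple, round(h * scale / multiple) * multiple)
    return img.resize((new_w, new_h), Image.LANCZOS)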

View File

@@ -55,7 +55,8 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
"""
class QwenImageLayeredTextEncoderStep
- QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided.
+ QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not
+ provided.
Components:
@@ -72,28 +73,23 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
Configs:
image_caption_prompt_en (default: <|im_start|>system
- You are a helpful assistant.<|im_end|>
- <|im_start|>user
- # Image Annotator
- You are a professional image annotator. Please write an image caption based on the input image:
+ You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
+ Please write an image caption based on the input image:
1. Write the caption using natural, descriptive language without structured formats or rich text.
2. Enrich caption details by including:
- Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
- - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on
+ - Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
+ attachment relations, action relations, comparative relations, causal relations, and so on
- Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
- - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks
+ - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
+ caption with quotation marks
3. Maintain authenticity and accuracy:
- Avoid generalizations
- Describe all visible information in the image, while do not add information not explicitly shown in the image
- <|vision_start|><|image_pad|><|vision_end|><|im_end|>
- <|im_start|>assistant
- )
+ <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
image_caption_prompt_cn (default: <|im_start|>system
- You are a helpful assistant.<|im_end|>
- <|im_start|>user
- # 图像标注器
- 你是一个专业的图像标注器。请基于输入图像,撰写图注:
+ You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注:
1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。
2. 通过加入以下内容,丰富图注细节:
- 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等
@@ -103,16 +99,11 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
3. 保持真实性与准确性:
- 不要使用笼统的描述
- 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容
- <|vision_start|><|image_pad|><|vision_end|><|im_end|>
- <|im_start|>assistant
- )
+ <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
prompt_template_encode (default: <|im_start|>system
- Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
- <|im_start|>user
- {}<|im_end|>
- <|im_start|>assistant
- )
+ Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
+ objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 34)
@@ -399,6 +390,7 @@ LAYERED_AUTO_BLOCKS = InsertableDict(
]
)
+ # auto_docstring
class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
"""
@@ -431,28 +423,23 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
Configs:
image_caption_prompt_en (default: <|im_start|>system
- You are a helpful assistant.<|im_end|>
- <|im_start|>user
- # Image Annotator
- You are a professional image annotator. Please write an image caption based on the input image:
+ You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator.
+ Please write an image caption based on the input image:
1. Write the caption using natural, descriptive language without structured formats or rich text.
2. Enrich caption details by including:
- Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on
- - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on
+ - Vision Relations between objects, such as spatial relations, functional relations, possessive relations,
+ attachment relations, action relations, comparative relations, causal relations, and so on
- Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on
- - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks
+ - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the
+ caption with quotation marks
3. Maintain authenticity and accuracy:
- Avoid generalizations
- Describe all visible information in the image, while do not add information not explicitly shown in the image
- <|vision_start|><|image_pad|><|vision_end|><|im_end|>
- <|im_start|>assistant
- )
+ <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
image_caption_prompt_cn (default: <|im_start|>system
- You are a helpful assistant.<|im_end|>
- <|im_start|>user
- # 图像标注器
- 你是一个专业的图像标注器。请基于输入图像,撰写图注:
+ You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注:
1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。
2. 通过加入以下内容,丰富图注细节:
- 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等
@@ -462,16 +449,11 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
3. 保持真实性与准确性:
- 不要使用笼统的描述
- 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容
- <|vision_start|><|image_pad|><|vision_end|><|im_end|>
- <|im_start|>assistant
- )
+ <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant )
prompt_template_encode (default: <|im_start|>system
- Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
- <|im_start|>user
- {}<|im_end|>
- <|im_start|>assistant
- )
+ Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the
+ objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant )
prompt_template_encode_start_idx (default: 34)
@@ -529,6 +511,7 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
images (`List`):
Generated images.
"""
model_name = "qwenimage-layered"
block_classes = LAYERED_AUTO_BLOCKS.values()
block_names = LAYERED_AUTO_BLOCKS.keys()
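All four files re-wrap the same kind of config pair: a chat-style `prompt_template_encode` with a `{}` slot for the user prompt, and a `prompt_template_encode_start_idx` giving the number of leading template tokens to drop from the encoder output. A sketch of how such a pair is typically used; the tokenizer/encoder arguments are placeholders, not the pipeline's actual encode step:

# Illustration of template formatting plus preamble-token dropping.
PROMPT_TEMPLATE = (
    "<|im_start|>system\nDescribe the image by detailing the color, shape, size, "
    "texture, quantity, text, spatial relationships of the objects and "
    "background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
)
START_IDX = 34  # the documented default: tokens of fixed preamble to drop

def encode_prompt(prompt, tokenizer, text_encoder):
    # Fill the user slot, run the encoder, then discard the fixed system/user
    # preamble so only prompt-conditioned hidden states remain.
    text = PROMPT_TEMPLATE.format(prompt)
    inputs = tokenizer(text, return_tensors="pt")
    hidden = text_encoder(**inputs, output_hidden_states=True).hidden_states[-1]
    return hidden[:, START_IDX:]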