From 1c90ce33f2445b29c1967976a1734db97f5eaa3a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:21:26 +0100 Subject: [PATCH] up --- .../qwenimage/modular_blocks_qwenimage.py | 47 +++++++------ .../modular_blocks_qwenimage_edit.py | 29 ++++---- .../modular_blocks_qwenimage_edit_plus.py | 24 +++---- .../modular_blocks_qwenimage_layered.py | 69 +++++++------------ 4 files changed, 79 insertions(+), 90 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 7f18de4f99..85b77c2a6b 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -77,11 +77,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -260,8 +257,7 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ class QwenImageOptionalControlNetVaeEncoderStep - Vae encoder step that encode the image inputs into their latent representations. - This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. @@ -458,7 +454,8 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -503,7 +500,8 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -593,7 +591,8 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -692,7 +691,8 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -789,7 +789,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -897,7 +898,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -1014,7 +1016,8 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -1232,7 +1235,8 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: @@ -1294,6 +1298,7 @@ AUTO_BLOCKS = InsertableDict( ] ) + # auto_docstring class QwenImageAutoBlocks(SequentialPipelineBlocks): """ @@ -1301,7 +1306,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage. - for image-to-image generation, you need to provide `image` - - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` - to run the controlnet workflow, you need to provide `control_image` - for text-to-image generation, all you need to provide is `prompt` @@ -1332,11 +1337,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -1428,6 +1430,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage" block_classes = AUTO_BLOCKS.values() @@ -1438,7 +1441,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): return ( "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n" + "- for image-to-image generation, you need to provide `image`\n" - + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n" + + "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`.\n" + "- to run the controlnet workflow, you need to provide `control_image`\n" + "- for text-to-image generation, all you need to provide is `prompt`" ) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 91efe9dda2..3fcbc8853f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -76,11 +76,10 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -424,7 +423,8 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -733,7 +733,8 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: @@ -801,6 +802,7 @@ EDIT_AUTO_BLOCKS = InsertableDict( ] ) + # auto_docstring class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ @@ -808,7 +810,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. - for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide + `padding_mask_crop` Components: @@ -835,11 +838,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -904,6 +906,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage-edit" block_classes = EDIT_AUTO_BLOCKS.values() block_names = EDIT_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 3a780daf96..0364e394d2 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -69,11 +69,10 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -130,8 +129,8 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVaeEncoderStep - VAE encoder step that encodes image inputs into latent representations. - Each image is resized independently based on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based + on its own aspect ratio to 1024x1024 target area. Components: @@ -397,6 +396,7 @@ EDIT_PLUS_AUTO_BLOCKS = InsertableDict( ] ) + # auto_docstring class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): """ @@ -430,11 +430,10 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -486,6 +485,7 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage-edit-plus" block_classes = EDIT_PLUS_AUTO_BLOCKS.values() block_names = EDIT_PLUS_AUTO_BLOCKS.keys() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 7cb5cd7a1c..5602fc9b93 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -55,7 +55,8 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredTextEncoderStep - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. Components: @@ -72,28 +73,23 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -103,16 +99,11 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -399,6 +390,7 @@ LAYERED_AUTO_BLOCKS = InsertableDict( ] ) + # auto_docstring class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): """ @@ -431,28 +423,23 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -462,16 +449,11 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -529,6 +511,7 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage-layered" block_classes = LAYERED_AUTO_BLOCKS.values() block_names = LAYERED_AUTO_BLOCKS.keys()