From 2a81f2ec5417efdc7773937dd7db2f675a46b66a Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 12:15:36 +0100 Subject: [PATCH] style --- .../qwenimage/modular_blocks_qwenimage.py | 86 ++++++++++++------- .../modular_blocks_qwenimage_edit.py | 46 ++++++---- .../modular_blocks_qwenimage_edit_plus.py | 26 +++--- .../modular_blocks_qwenimage_layered.py | 47 +++++----- 4 files changed, 116 insertions(+), 89 deletions(-) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 19feffe77e..d54dca5f5a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -58,7 +58,8 @@ logger = logging.get_logger(__name__) # 1. TEXT ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): """ class QwenImageAutoTextEncoderStep @@ -76,11 +77,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -111,6 +109,7 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask """ + model_name = "qwenimage" block_classes = [QwenImageTextEncoderStep()] block_names = ["text_encoder"] @@ -127,7 +126,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): # 2. VAE ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageInpaintVaeEncoderStep @@ -175,6 +175,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage" block_classes = [QwenImageInpaintProcessImagesInputStep(), QwenImageVaeEncoderStep()] block_names = ["preprocess", "encode"] @@ -189,7 +190,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): ) -#auto_docstring +# auto_docstring class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgVaeEncoderStep @@ -223,6 +224,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage" block_classes = [QwenImageProcessImagesInputStep(), QwenImageVaeEncoderStep()] @@ -250,13 +252,12 @@ class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks): # optional controlnet vae encoder -#auto_docstring +# auto_docstring class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): """ class QwenImageOptionalControlNetVaeEncoderStep - Vae encoder step that encode the image inputs into their latent representations. - This is an auto pipeline block. + Vae encoder step that encode the image inputs into their latent representations. This is an auto pipeline block. - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided. - if `control_image` is not provided, step will be skipped. @@ -287,6 +288,7 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): control_image_latents (`Tensor`): The latents representing the control image """ + block_classes = [QwenImageControlNetVaeEncoderStep] block_names = ["controlnet"] block_trigger_inputs = ["control_image"] @@ -307,7 +309,7 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): # assemble input steps -#auto_docstring +# auto_docstring class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgInputStep @@ -353,6 +355,7 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage" block_classes = [QwenImageTextInputsStep(), QwenImageAdditionalInputsStep(image_latent_inputs=["image_latents"])] block_names = ["text_inputs", "additional_inputs"] @@ -364,7 +367,7 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): " - update height/width based `image_latents`, patchify `image_latents`." -#auto_docstring +# auto_docstring class QwenImageInpaintInputStep(SequentialPipelineBlocks): """ class QwenImageInpaintInputStep @@ -412,6 +415,7 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage" block_classes = [ QwenImageTextInputsStep(), @@ -429,7 +433,7 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): # assemble prepare latents steps -#auto_docstring +# auto_docstring class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ class QwenImageInpaintPrepareLatentsStep @@ -450,7 +454,8 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -472,6 +477,7 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): mask (`Tensor`): The mask to use for the inpainting process. """ + model_name = "qwenimage" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -489,12 +495,13 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): # Qwen Image (text2image) -#auto_docstring +# auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -570,20 +577,22 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + @property def outputs(self): return [ OutputParam.latents(), ] + # Qwen Image (inpainting) -#auto_docstring +# auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -675,13 +684,15 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): OutputParam.latents(), ] + # Qwen Image (image2image) -#auto_docstring +# auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -771,13 +782,15 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): OutputParam.latents(), ] + # Qwen Image (text2image) with controlnet -#auto_docstring +# auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetCoreDenoiseStep - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the + inputs (timesteps, latents, rope inputs etc.). Components: @@ -871,20 +884,22 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return "step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.)." - + @property def outputs(self): return [ OutputParam.latents(), ] + # Qwen Image (inpainting) with controlnet -#auto_docstring +# auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetInpaintCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + inpaint task. Components: @@ -996,12 +1011,13 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): # Qwen Image (image2image) with controlnet -#auto_docstring +# auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageControlNetImg2ImgCoreDenoiseStep - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for + img2img task. Components: @@ -1102,13 +1118,14 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): @property def description(self): return "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task." - + @property def outputs(self): return [ OutputParam.latents(), ] + # Auto denoise step for QwenImage class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): block_classes = [ @@ -1176,7 +1193,7 @@ class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): # standard decode step works for most tasks except for inpaint -#auto_docstring +# auto_docstring class QwenImageDecodeStep(SequentialPipelineBlocks): """ class QwenImageDecodeStep @@ -1202,6 +1219,7 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -1212,12 +1230,13 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): # Inpaint decode step -#auto_docstring +# auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: @@ -1240,6 +1259,7 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index cae6236eb5..37a438ea1f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -55,7 +55,8 @@ logger = logging.get_logger(__name__) # 1. TEXT ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditVLEncoderStep @@ -75,11 +76,10 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -130,7 +130,7 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): # Edit VAE encoder -#auto_docstring +# auto_docstring class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditVaeEncoderStep @@ -163,6 +163,7 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -177,7 +178,7 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): # Edit Inpaint VAE encoder -#auto_docstring +# auto_docstring class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintVaeEncoderStep @@ -224,6 +225,7 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditResizeStep(), @@ -265,7 +267,7 @@ class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks): # assemble input steps -#auto_docstring +# auto_docstring class QwenImageEditInputStep(SequentialPipelineBlocks): """ class QwenImageEditInputStep @@ -313,6 +315,7 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -329,7 +332,7 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): ) -#auto_docstring +# auto_docstring class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintInputStep @@ -379,6 +382,7 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): image_width (`int`): The image width calculated from the image latents dimension """ + model_name = "qwenimage-edit" block_classes = [ QwenImageTextInputsStep(), @@ -398,7 +402,7 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): # assemble prepare latents steps -#auto_docstring +# auto_docstring class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintPrepareLatentsStep @@ -419,7 +423,8 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -441,6 +446,7 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): mask (`Tensor`): The mask to use for the inpainting process. """ + model_name = "qwenimage-edit" block_classes = [QwenImagePrepareLatentsWithStrengthStep(), QwenImageCreateMaskLatentsStep()] block_names = ["add_noise_to_latents", "create_mask_latents"] @@ -455,7 +461,7 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): # Qwen Image Edit (image2image) core denoise step -#auto_docstring +# auto_docstring class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageEditCoreDenoiseStep @@ -547,7 +553,7 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): # Qwen Image Edit (inpainting) core denoise step -#auto_docstring +# auto_docstring class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintCoreDenoiseStep @@ -671,20 +677,21 @@ class QwenImageEditAutoCoreDenoiseStep(ConditionalPipelineBlocks): " - `QwenImageEditCoreDenoiseStep` when `image_latents` is provided\n" "Supports edit (img2img) and edit inpainting tasks for QwenImage-Edit." ) - + @property def outputs(self): return [ OutputParam.latents(), ] + # ==================== # 4. DECODE # ==================== # Decode step (standard) -#auto_docstring +# auto_docstring class QwenImageEditDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditDecodeStep @@ -710,6 +717,7 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] @@ -720,12 +728,13 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): # Inpaint decode step -#auto_docstring +# auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditInpaintDecodeStep - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: @@ -748,6 +757,7 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage-edit" block_classes = [QwenImageDecoderStep(), QwenImageInpaintProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 2fcd633f0d..851b69f232 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -49,7 +49,7 @@ logger = logging.get_logger(__name__) # ==================== -#auto_docstring +# auto_docstring class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVLEncoderStep @@ -69,11 +69,10 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -125,13 +124,13 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): # ==================== -#auto_docstring +# auto_docstring class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageEditPlusVaeEncoderStep - VAE encoder step that encodes image inputs into latent representations. - Each image is resized independently based on its own aspect ratio to 1024x1024 target area. + VAE encoder step that encodes image inputs into latent representations. Each image is resized independently based + on its own aspect ratio to 1024x1024 target area. Components: @@ -182,7 +181,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): # assemble input steps -#auto_docstring +# auto_docstring class QwenImageEditPlusInputStep(SequentialPipelineBlocks): """ class QwenImageEditPlusInputStep @@ -232,6 +231,7 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): image_width (`List`): The image widths calculated from the image latents dimension """ + model_name = "qwenimage-edit-plus" block_classes = [ QwenImageTextInputsStep(), @@ -251,7 +251,7 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): # Qwen Image Edit Plus (image2image) core denoise step -#auto_docstring +# auto_docstring class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageEditPlusCoreDenoiseStep @@ -312,6 +312,7 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage-edit-plus" block_classes = [ QwenImageEditPlusInputStep(), @@ -346,7 +347,7 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): # ==================== -#auto_docstring +# auto_docstring class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): """ class QwenImageEditPlusDecodeStep @@ -372,6 +373,7 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage-edit-plus" block_classes = [QwenImageDecoderStep(), QwenImageProcessImagesOutputStep()] block_names = ["decode", "postprocess"] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index f647f16868..56fa1345a5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -49,12 +49,14 @@ logger = logging.get_logger(__name__) # 1. TEXT ENCODER # ==================== -#auto_docstring + +# auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredTextEncoderStep - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. Components: @@ -71,28 +73,23 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -102,16 +99,11 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -174,7 +166,7 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): # Edit VAE encoder -#auto_docstring +# auto_docstring class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): """ class QwenImageLayeredVaeEncoderStep @@ -210,6 +202,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ + model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredResizeStep(), @@ -230,7 +223,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): # assemble input steps -#auto_docstring +# auto_docstring class QwenImageLayeredInputStep(SequentialPipelineBlocks): """ class QwenImageLayeredInputStep @@ -278,6 +271,7 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): width (`int`): The width of the image output """ + model_name = "qwenimage-layered" block_classes = [ QwenImageTextInputsStep(), @@ -295,7 +289,7 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): # Qwen Image Layered (image2image) core denoise step -#auto_docstring +# auto_docstring class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): """ class QwenImageLayeredCoreDenoiseStep @@ -353,6 +347,7 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage-layered" block_classes = [ QwenImageLayeredInputStep(),