mirror of https://github.com/huggingface/diffusers.git synced 2026-01-27 17:22:53 +03:00

make style

This commit is contained in:
yiyixuxu
2026-01-19 09:27:40 +01:00
parent 8d45ff5bf6
commit f056af1fbb
10 changed files with 497 additions and 432 deletions

View File

@@ -438,7 +438,7 @@ INPUT_PARAM_TEMPLATES = {
"description": "Number of layers to extract from the image",
},
# common intermediate inputs
"prompt_embeds":{
"prompt_embeds": {
"type_hint": torch.Tensor,
"required": True,
"description": "text embeddings used to guide the image generation. Can be generated from text_encoder step.",
@@ -531,16 +531,16 @@ class InputParam:
raise ValueError(f"InputParam template for {template_name} not found")
template_kwargs = INPUT_PARAM_TEMPLATES[template_name].copy()
# Determine the actual param name:
# 1. From overrides if provided
# 2. From template if present
# 3. Fall back to template_name
name = overrides.pop("name", template_kwargs.pop("name", template_name))
if note and "description" in template_kwargs:
template_kwargs["description"] = f"{template_kwargs['description']} ({note})"
template_kwargs.update(overrides)
return cls(name=name, **template_kwargs)
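As a usage sketch of the resolution order above (assuming the `template` classmethod signature implied by this hunk; illustrative only):

param = InputParam.template("prompt_embeds")
# name falls back to the template key: "prompt_embeds", required=True

param = InputParam.template("prompt_embeds", note="updated in input step")
# note is appended to the template description: "... (updated in input step)"

param = InputParam.template("prompt_embeds", name="negative_prompt_embeds", required=False)
# overrides win over the template, including the name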
@@ -564,18 +564,18 @@ class OutputParam:
"""Get template for name if exists, otherwise raise ValueError."""
if template_name not in OUTPUT_PARAM_TEMPLATES:
raise ValueError(f"OutputParam template for {template_name} not found")
template_kwargs = OUTPUT_PARAM_TEMPLATES[template_name].copy()
# Determine the actual param name:
# 1. From overrides if provided
# 2. From template if present
# 3. Fall back to template_name
name = overrides.pop("name", template_kwargs.pop("name", template_name))
if note and "description" in template_kwargs:
template_kwargs["description"] = f"{template_kwargs['description']} ({note})"
template_kwargs.update(overrides)
return cls(name=name, **template_kwargs)
@@ -913,4 +913,4 @@ def make_doc_string(
output += "\n\n"
output += format_output_params(outputs, indent_level=2)
return output
return output

View File

@@ -117,6 +117,7 @@ def get_timesteps(scheduler, num_inference_steps, strength):
# 1. PREPARE LATENTS
# ====================
# auto_docstring
class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
"""
@@ -137,8 +138,8 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
generated in input step.
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
dtype (`dtype`, *optional*, defaults to torch.float32):
The dtype of the model inputs, can be generated in input step.
@@ -150,6 +151,7 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
latents (`Tensor`):
The initial latents to use for the denoising process
"""
model_name = "qwenimage"
@property
@@ -254,8 +256,8 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
generator (`Generator`, *optional*):
Torch generator for deterministic generation.
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
generated in input step.
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
dtype (`dtype`, *optional*, defaults to torch.float32):
The dtype of the model inputs, can be generated in input step.
@@ -267,6 +269,7 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
latents (`Tensor`):
The initial latents to use for the denoising process
"""
model_name = "qwenimage-layered"
@property
@@ -353,7 +356,8 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks):
# auto_docstring
class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
"""
Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, prepare_latents. Both noise and image latents should already be patchified.
Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps,
prepare_latents. Both noise and image latents should already be patchified.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`)
@@ -362,8 +366,8 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
latents (`Tensor`):
The initial random noise, can be generated in prepare latents step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from
vae encoder and updated in input step.)
image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
generated from vae encoder and updated in input step.)
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
@@ -373,6 +377,7 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
latents (`Tensor`):
The scaled noisy latents to use for inpainting/image-to-image denoising.
"""
model_name = "qwenimage"
@property
@@ -396,10 +401,10 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
),
InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."),
InputParam(
name="timesteps",
required=True,
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step."
name="timesteps",
required=True,
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
),
]
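What this step computes can be sketched as follows, assuming the flow-match convention where `scale_noise` interpolates x_t = (1 - sigma_t) * x_0 + sigma_t * noise (a sketch, not the block's literal body):

def add_noise_with_strength(scheduler, image_latents, noise, timesteps):
    # the first retained timestep (selected via strength in set_timesteps)
    # decides how much of the original image survives
    latent_timestep = timesteps[:1].repeat(image_latents.shape[0])
    return scheduler.scale_noise(image_latents, latent_timestep, noise)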
@@ -475,6 +480,7 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
mask (`Tensor`):
The mask to use for the inpainting process.
"""
model_name = "qwenimage"
@property
@@ -541,10 +547,12 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
# 2. SET TIMESTEPS
# ====================
# auto_docstring
class QwenImageSetTimestepsStep(ModularPipelineBlocks):
"""
Step that sets the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step.
Step that sets the scheduler's timesteps for text-to-image generation. Should be run after prepare latents
step.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`)
@@ -561,6 +569,7 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks):
timesteps (`Tensor`):
The timesteps to use for the denoising process
"""
model_name = "qwenimage"
@property
@@ -579,10 +588,10 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks):
InputParam.template("num_inference_steps"),
InputParam.template("sigmas"),
InputParam(
name="latents",
name="latents",
required=True,
type_hint=torch.Tensor,
description="The initial random noised latents for the denoising process. Can be generated in prepare latents step."
description="The initial random noised latents for the denoising process. Can be generated in prepare latents step.",
),
]
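A hedged sketch of the scheduler call this step builds up to; deriving `mu` from the packed sequence length is an assumption modeled on other flow-match diffusers pipelines, and `calculate_shift` is a hypothetical helper name:

import numpy as np

def set_flow_match_timesteps(scheduler, latents, num_inference_steps, device):
    sigmas = np.linspace(1.0, 1.0 / num_inference_steps, num_inference_steps).tolist()
    image_seq_len = latents.shape[1]  # latents arrive already packed to (B, seq, C)
    mu = calculate_shift(image_seq_len)  # hypothetical helper for dynamic shifting
    scheduler.set_timesteps(sigmas=sigmas, mu=mu, device=device)
    return scheduler.timesteps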
@@ -640,13 +649,14 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
sigmas (`List`, *optional*):
Custom sigmas for the denoising process.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from
vae encoder and packed in input step.)
image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
generated from vae encoder and packed in input step.)
Outputs:
timesteps (`Tensor`):
The timesteps to use for the denoising process.
"""
model_name = "qwenimage-layered"
@property
@@ -671,9 +681,7 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="timesteps",
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process."
name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process."
),
]
@@ -711,7 +719,8 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks):
# auto_docstring
class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
"""
Step that sets the scheduler's timesteps for image-to-image generation and inpainting. Should be run after prepare latents step.
Step that sets the scheduler's timesteps for image-to-image generation and inpainting. Should be run after
prepare latents step.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`)
@@ -732,6 +741,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
num_inference_steps (`int`):
The number of denoising steps to perform at inference time. Updated based on strength.
"""
model_name = "qwenimage"
@property
@@ -750,10 +760,10 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
InputParam.template("num_inference_steps"),
InputParam.template("sigmas"),
InputParam(
"latents",
required=True,
"latents",
required=True,
type_hint=torch.Tensor,
description="The latents to use for the denoising process. Can be generated in prepare latents step."
description="The latents to use for the denoising process. Can be generated in prepare latents step.",
),
InputParam.template("strength", default=0.9),
]
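For reference, the strength handling usually follows the standard img2img truncation; a sketch of the `get_timesteps` helper referenced in the hunk header near the top of this file (details may differ):

def get_timesteps(scheduler, num_inference_steps, strength):
    # strength=1.0 keeps the full schedule; smaller values drop early steps
    init_timestep = min(num_inference_steps * strength, num_inference_steps)
    t_start = int(max(num_inference_steps - init_timestep, 0))
    timesteps = scheduler.timesteps[t_start * scheduler.order :]
    return timesteps, num_inference_steps - t_start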
@@ -815,6 +825,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
## RoPE inputs for denoiser
# auto_docstring
class QwenImageRoPEInputsStep(ModularPipelineBlocks):
"""
@@ -822,8 +833,8 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
Inputs:
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
generated in input step.
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
height (`int`):
The height in pixels of the generated image.
width (`int`):
@@ -841,6 +852,7 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
negative_txt_seq_lens (`List`):
The sequence lengths of the negative prompt embeds, used for RoPE calculation
"""
model_name = "qwenimage"
@property
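A sketch of how these RoPE inputs are typically derived (names assumed from the non-modular QwenImage pipeline):

def prepare_rope_inputs(prompt_embeds_mask, height, width, vae_scale_factor, batch_size):
    # sequence lengths from the attention mask; image shapes from the target resolution
    txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist()
    img_shapes = [(1, height // vae_scale_factor // 2, width // vae_scale_factor // 2)] * batch_size
    return img_shapes, txt_seq_lens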
@@ -911,12 +923,13 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
# auto_docstring
class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
"""
Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after prepare_latents step
Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after
prepare_latents step
Inputs:
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
generated in input step.
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
image_height (`int`):
The height of the reference image. Can be generated in input step.
image_width (`int`):
@@ -938,6 +951,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
negative_txt_seq_lens (`List`):
The sequence lengths of the negative prompt embeds, used for RoPE calculation
"""
model_name = "qwenimage"
@property
@@ -948,8 +962,18 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam.template("batch_size"),
InputParam(name="image_height", required=True, type_hint=int, description="The height of the reference image. Can be generated in input step."),
InputParam(name="image_width", required=True, type_hint=int, description="The width of the reference image. Can be generated in input step."),
InputParam(
name="image_height",
required=True,
type_hint=int,
description="The height of the reference image. Can be generated in input step.",
),
InputParam(
name="image_width",
required=True,
type_hint=int,
description="The width of the reference image. Can be generated in input step.",
),
InputParam.template("height", required=True),
InputParam.template("width", required=True),
InputParam.template("prompt_embeds_mask"),
@@ -1016,13 +1040,13 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
"""
Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus.
Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images.
Should be placed after prepare_latents step.
Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. Should be placed
after prepare_latents step.
Inputs:
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
generated in input step.
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
image_height (`List`):
The heights of the reference images. Can be generated in input step.
image_width (`List`):
@@ -1044,6 +1068,7 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
negative_txt_seq_lens (`List`):
The sequence lengths of the negative prompt embeds, used for RoPE calculation
"""
model_name = "qwenimage-edit-plus"
@property
@@ -1058,8 +1083,18 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam.template("batch_size"),
InputParam(name="image_height", required=True, type_hint=List[int], description="The heights of the reference images. Can be generated in input step."),
InputParam(name="image_width", required=True, type_hint=List[int], description="The widths of the reference images. Can be generated in input step."),
InputParam(
name="image_height",
required=True,
type_hint=List[int],
description="The heights of the reference images. Can be generated in input step.",
),
InputParam(
name="image_width",
required=True,
type_hint=List[int],
description="The widths of the reference images. Can be generated in input step.",
),
InputParam.template("height", required=True),
InputParam.template("width", required=True),
InputParam.template("prompt_embeds_mask"),
@@ -1126,8 +1161,8 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
Inputs:
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
generated in input step.
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
layers (`int`, *optional*, defaults to 4):
Number of layers to extract from the image
height (`int`):
@@ -1149,6 +1184,7 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
additional_t_cond (`Tensor`):
The additional t cond, used for RoPE calculation
"""
model_name = "qwenimage-layered"
@property
@@ -1231,6 +1267,7 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks):
## ControlNet inputs for denoiser
# auto_docstring
class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
"""
@@ -1247,7 +1284,8 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
Scale for ControlNet conditioning.
control_image_latents (`Tensor`):
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
step.
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
@@ -1255,6 +1293,7 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
controlnet_keep (`List`):
The controlnet keep values
"""
model_name = "qwenimage"
@property
@@ -1274,16 +1313,16 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
InputParam.template("control_guidance_end"),
InputParam.template("controlnet_conditioning_scale"),
InputParam(
name="control_image_latents",
required=True,
type_hint=torch.Tensor,
description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step."
name="control_image_latents",
required=True,
type_hint=torch.Tensor,
description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.",
),
InputParam(
name="timesteps",
required=True,
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step."
name="timesteps",
required=True,
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
),
]

View File

@@ -30,10 +30,12 @@ logger = logging.get_logger(__name__)
# after denoising loop (unpack latents)
#auto_docstring
# auto_docstring
class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
"""
Step that unpacks the latents from a 3D tensor (batch_size, sequence_length, channels) into a 5D tensor (batch_size, channels, 1, height, width)
Step that unpacks the latents from a 3D tensor (batch_size, sequence_length, channels) into a 5D tensor
(batch_size, channels, 1, height, width)
Components:
pachifier (`QwenImagePachifier`)
@@ -50,6 +52,7 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
latents (`Tensor`):
The denoised latents unpacked to B, C, 1, H, W
"""
model_name = "qwenimage"
@property
@@ -70,10 +73,10 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
InputParam.template("height", required=True),
InputParam.template("width", required=True),
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The latents to decode, can be generated in the denoise step."
name="latents",
required=True,
type_hint=torch.Tensor,
description="The latents to decode, can be generated in the denoise step.",
),
]
@@ -81,9 +84,7 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="latents",
type_hint=torch.Tensor,
description="The denoisedlatents unpacked to B, C, 1, H, W"
name="latents", type_hint=torch.Tensor, description="The denoisedlatents unpacked to B, C, 1, H, W"
),
]
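The unpatchify this step delegates to the pachifier can be sketched as follows, modeled on the QwenImage pipelines' latent unpacking (illustrative only):

def unpack_latents(latents, height, width, vae_scale_factor):
    # (B, seq, C*4) -> (B, C, 1, H, W)
    batch_size, num_patches, channels = latents.shape
    height = 2 * (int(height) // (vae_scale_factor * 2))
    width = 2 * (int(width) // (vae_scale_factor * 2))
    latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
    latents = latents.permute(0, 3, 1, 4, 2, 5)
    return latents.reshape(batch_size, channels // 4, 1, height, width)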
@@ -100,7 +101,7 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
return components, state
#auto_docstring
# auto_docstring
class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
"""
Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising.
@@ -122,6 +123,7 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
latents (`Tensor`):
Denoised latents. (unpacked to B, C, layers+1, H, W)
"""
model_name = "qwenimage-layered"
@property
@@ -138,10 +140,10 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The denoised latents to decode, can be generated in the denoise step."
name="latents",
required=True,
type_hint=torch.Tensor,
description="The denoised latents to decode, can be generated in the denoise step.",
),
InputParam.template("height", required=True),
InputParam.template("width", required=True),
@@ -173,7 +175,8 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
# decode step
#auto_docstring
# auto_docstring
class QwenImageDecoderStep(ModularPipelineBlocks):
"""
Step that decodes the latents to images
@@ -183,12 +186,14 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
Outputs:
images (`List`):
Generated images. (tensor output of the vae decoder.)
"""
model_name = "qwenimage"
@property
@@ -207,10 +212,10 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step."
name="latents",
required=True,
type_hint=torch.Tensor,
description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.",
),
]
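The decode itself typically denormalizes with the VAE's channel statistics first; a sketch assuming AutoencoderKLQwenImage's latents_mean/latents_std config (the exact convention is an assumption):

def decode_latents(vae, latents):
    latents_mean = torch.tensor(vae.config.latents_mean).view(1, vae.config.z_dim, 1, 1, 1).to(latents)
    latents_std = torch.tensor(vae.config.latents_std).view(1, vae.config.z_dim, 1, 1, 1).to(latents)
    latents = latents * latents_std + latents_mean
    return vae.decode(latents, return_dict=False)[0][:, :, 0]  # drop the frame dim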
@@ -246,18 +251,18 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
return components, state
#auto_docstring
# auto_docstring
class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
"""
Decode unpacked latents (B, C, layers+1, H, W) into layer images.
Components:
vae (`AutoencoderKLQwenImage`)
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
@@ -265,6 +270,7 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
images (`List`):
Generated images.
"""
model_name = "qwenimage-layered"
@property
@@ -287,10 +293,10 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step."
name="latents",
required=True,
type_hint=torch.Tensor,
description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.",
),
InputParam.template("output_type"),
]
@@ -345,7 +351,8 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks):
# postprocess the decoded images
#auto_docstring
# auto_docstring
class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
"""
postprocess the generated image
@@ -363,6 +370,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
images (`List`):
Generated images.
"""
model_name = "qwenimage"
@property
@@ -384,10 +392,10 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam(
name="images",
required=True,
type_hint=torch.Tensor,
description="the generated image tensor from decoders step"
name="images",
required=True,
type_hint=torch.Tensor,
description="the generated image tensor from decoders step",
),
InputParam.template("output_type"),
]
@@ -416,7 +424,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
return components, state
#auto_docstring
# auto_docstring
class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
"""
postprocess the generated image, optionally apply the mask overlay to the original image.
@@ -430,12 +438,14 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`Dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. Generated in InpaintProcessImagesInputStep.
The kwargs for the postprocess step to apply the mask overlay. Generated in
InpaintProcessImagesInputStep.
Outputs:
images (`List`):
Generated images.
"""
model_name = "qwenimage"
@property
@@ -457,16 +467,17 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam(
name="images",
required=True,
type_hint=torch.Tensor,
description="the generated image tensor from decoders step"
name="images",
required=True,
type_hint=torch.Tensor,
description="the generated image tensor from decoders step",
),
InputParam.template("output_type"),
InputParam(
name="mask_overlay_kwargs",
name="mask_overlay_kwargs",
type_hint=Dict[str, Any],
description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."),
description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep.",
),
]
@property

View File

@@ -50,10 +50,10 @@ class QwenImageLoopBeforeDenoiser(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."
name="latents",
required=True,
type_hint=torch.Tensor,
description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
),
]
@@ -80,10 +80,10 @@ class QwenImageEditLoopBeforeDenoiser(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam(
name="latents",
required=True,
type_hint=torch.Tensor,
description="The initial latents to use for the denoising process. Can be generated in prepare_latent step."
name="latents",
required=True,
type_hint=torch.Tensor,
description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
),
InputParam.template("image_latents"),
]
@@ -131,10 +131,10 @@ class QwenImageLoopBeforeDenoiserControlNet(ModularPipelineBlocks):
),
InputParam.template("controlnet_conditioning_scale", note="updated in prepare_controlnet_inputs step."),
InputParam(
name="controlnet_keep",
required=True,
type_hint=List[float],
description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step."
name="controlnet_keep",
required=True,
type_hint=List[float],
description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step.",
),
]
@@ -467,10 +467,10 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
def loop_inputs(self) -> List[InputParam]:
return [
InputParam(
name="timesteps",
required=True,
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step."
name="timesteps",
required=True,
type_hint=torch.Tensor,
description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
),
InputParam.template("num_inference_steps", required=True),
]
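The loop logic all the denoise steps below inherit reduces to roughly the following (a sketch; the real `__call__` also manages the progress bar, and the exact sub-block call signature is an assumption):

def denoise_loop(components, block_state, timesteps, sub_blocks):
    for i, t in enumerate(timesteps):
        for block in sub_blocks.values():
            # before-denoiser -> (controlnet) -> denoiser -> after-denoiser,
            # all reading and writing the shared block state
            components, block_state = block(components, block_state, i=i, t=t)
    return components, block_state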
@@ -505,21 +505,21 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
# Qwen Image (text2image, image2image)
# auto_docstring
class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method
At each iteration, it runs blocks defined in `sub_blocks` sequentially:
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
defined in `sub_blocks` sequentially:
- `QwenImageLoopBeforeDenoiser`
- `QwenImageLoopDenoiser`
- `QwenImageLoopAfterDenoiser`
This block supports text2image and image2image tasks for QwenImage.
Components:
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
(`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
@@ -539,6 +539,7 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
@@ -551,8 +552,8 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
@property
def description(self) -> str:
return (
"Denoise step that iteratively denoise the latents. \n"
"Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n"
"Denoise step that iteratively denoise the latents.\n"
"Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method\n"
"At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
" - `QwenImageLoopBeforeDenoiser`\n"
" - `QwenImageLoopDenoiser`\n"
@@ -565,9 +566,9 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
# auto_docstring
class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method
At each iteration, it runs blocks defined in `sub_blocks` sequentially:
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
defined in `sub_blocks` sequentially:
- `QwenImageLoopBeforeDenoiser`
- `QwenImageLoopDenoiser`
- `QwenImageLoopAfterDenoiser`
@@ -575,9 +576,8 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
This block supports inpainting tasks for QwenImage.
Components:
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
(`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
@@ -603,6 +603,7 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
QwenImageLoopBeforeDenoiser,
@@ -630,9 +631,9 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
# auto_docstring
class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method
At each iteration, it runs blocks defined in `sub_blocks` sequentially:
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
defined in `sub_blocks` sequentially:
- `QwenImageLoopBeforeDenoiser`
- `QwenImageLoopBeforeDenoiserControlNet`
- `QwenImageLoopDenoiser`
@@ -640,10 +641,8 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
This block supports text2img/img2img tasks with controlnet for QwenImage.
Components:
guider (`ClassifierFreeGuidance`)
controlnet (`QwenImageControlNetModel`)
transformer (`QwenImageTransformer2DModel`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer
(`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
@@ -669,6 +668,7 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
QwenImageLoopBeforeDenoiser,
@@ -696,9 +696,9 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
# auto_docstring
class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method
At each iteration, it runs blocks defined in `sub_blocks` sequentially:
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
defined in `sub_blocks` sequentially:
- `QwenImageLoopBeforeDenoiser`
- `QwenImageLoopBeforeDenoiserControlNet`
- `QwenImageLoopDenoiser`
@@ -707,10 +707,8 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
This block supports inpainting tasks with controlnet for QwenImage.
Components:
guider (`ClassifierFreeGuidance`)
controlnet (`QwenImageControlNetModel`)
transformer (`QwenImageTransformer2DModel`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer
(`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
@@ -742,6 +740,7 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage"
block_classes = [
QwenImageLoopBeforeDenoiser,
@@ -777,18 +776,17 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
# auto_docstring
class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method
At each iteration, it runs blocks defined in `sub_blocks` sequentially:
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
defined in `sub_blocks` sequentially:
- `QwenImageEditLoopBeforeDenoiser`
- `QwenImageEditLoopDenoiser`
- `QwenImageLoopAfterDenoiser`
This block supports QwenImage Edit.
Components:
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
(`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
@@ -810,6 +808,7 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditLoopBeforeDenoiser,
@@ -835,9 +834,9 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):
# auto_docstring
class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method
At each iteration, it runs blocks defined in `sub_blocks` sequentially:
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
defined in `sub_blocks` sequentially:
- `QwenImageEditLoopBeforeDenoiser`
- `QwenImageEditLoopDenoiser`
- `QwenImageLoopAfterDenoiser`
@@ -845,9 +844,8 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
This block supports inpainting tasks for QwenImage Edit.
Components:
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
(`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
@@ -873,6 +871,7 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-edit"
block_classes = [
QwenImageEditLoopBeforeDenoiser,
@@ -900,18 +899,17 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
# auto_docstring
class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper):
"""
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method
At each iteration, it runs blocks defined in `sub_blocks` sequentially:
Denoise step that iteratively denoises the latents.
Its loop logic is defined in the `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks
defined in `sub_blocks` sequentially:
- `QwenImageEditLoopBeforeDenoiser`
- `QwenImageEditLoopDenoiser`
- `QwenImageLoopAfterDenoiser`
This block supports QwenImage Layered.
Components:
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler
(`FlowMatchEulerDiscreteScheduler`)
Inputs:
timesteps (`Tensor`):
@@ -933,6 +931,7 @@ class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper):
latents (`Tensor`):
Denoised latents.
"""
model_name = "qwenimage-layered"
block_classes = [
QwenImageEditLoopBeforeDenoiser,

View File

@@ -30,7 +30,7 @@ from ...pipelines.qwenimage.pipeline_qwenimage_edit import calculate_dimensions
from ...utils import logging
from ...utils.torch_utils import unwrap_module
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
from .modular_pipeline import QwenImageModularPipeline
from .prompt_templates import (
QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE,
@@ -277,6 +277,7 @@ def encode_vae_image(
# In most of our other pipelines, resizing is done as part of the image preprocessing step.
# ====================
# auto_docstring
class QwenImageEditResizeStep(ModularPipelineBlocks):
"""
@@ -293,8 +294,8 @@ class QwenImageEditResizeStep(ModularPipelineBlocks):
resized_image (`List`):
The resized images
"""
model_name = "qwenimage-edit"
model_name = "qwenimage-edit"
@property
def description(self) -> str:
@@ -319,8 +320,8 @@ class QwenImageEditResizeStep(ModularPipelineBlocks):
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="resized_image",
type_hint=List[PIL.Image.Image],
name="resized_image",
type_hint=List[PIL.Image.Image],
description="The resized images",
),
]
@@ -353,7 +354,8 @@ class QwenImageEditResizeStep(ModularPipelineBlocks):
# auto_docstring
class QwenImageLayeredResizeStep(ModularPipelineBlocks):
"""
Image Resize step that resizes the image to a target area (defined by the resolution parameter from the user) while maintaining the aspect ratio.
Image Resize step that resizes the image to a target area (defined by the resolution parameter from the user)
while maintaining the aspect ratio.
Components:
image_resize_processor (`VaeImageProcessor`)
@@ -368,11 +370,12 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks):
resized_image (`List`):
The resized images
"""
model_name = "qwenimage-layered"
@property
def description(self) -> str:
return f"Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio."
return "Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio."
@property
def expected_components(self) -> List[ComponentSpec]:
@@ -399,11 +402,13 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks):
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam(
name="resized_image",
type_hint=List[PIL.Image.Image],
description="The resized images",
)]
return [
OutputParam(
name="resized_image",
type_hint=List[PIL.Image.Image],
description="The resized images",
)
]
@staticmethod
def check_inputs(resolution: int):
@@ -442,8 +447,8 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks):
class QwenImageEditPlusResizeStep(ModularPipelineBlocks):
"""
Resize images for QwenImage Edit Plus pipeline.
Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text encoding.
Each image is resized independently based on its own aspect ratio.
Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text
encoding. Each image is resized independently based on its own aspect ratio.
Components:
image_resize_processor (`VaeImageProcessor`)
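The per-image resizing leans on the `calculate_dimensions` helper imported at the top of this file; roughly (a sketch, with the rounding convention assumed):

import math

def calculate_dimensions(target_area, ratio):
    # pick width/height with the requested aspect ratio whose product is
    # approximately target_area
    width = math.sqrt(target_area * ratio)
    height = width / ratio
    # snap to multiples of 32 (rounding convention assumed)
    width = round(width / 32) * 32
    height = round(height / 32) * 32
    return width, height, None  # third return value unused by the callers here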
@@ -484,7 +489,7 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
# image
return [InputParam.template("image")]
return [InputParam.template("image")]
@property
def intermediate_outputs(self) -> List[OutputParam]:
@@ -518,13 +523,11 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks):
resized_cond_images = []
for image in images:
image_width, image_height = image.size
# For VAE encoder (1024x1024 target area)
vae_width, vae_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height)
resized_images.append(
components.image_resize_processor.resize(image, height=vae_height, width=vae_width)
)
resized_images.append(components.image_resize_processor.resize(image, height=vae_height, width=vae_width))
# For VL text encoder (384x384 target area)
vl_width, vl_height, _ = calculate_dimensions(384 * 384, image_width / image_height)
resized_cond_images.append(
@@ -541,16 +544,16 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks):
# 2. GET IMAGE PROMPT
# ====================
# auto_docstring
class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks):
"""
Auto-caption step that generates a text prompt from the input image if none is provided.
Uses the VL model (text_encoder) to generate a description of the image.
If prompt is already provided, this step passes through unchanged.
Uses the VL model (text_encoder) to generate a description of the image. If prompt is already provided, this step
passes through unchanged.
Components:
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`)
Inputs:
prompt (`str`, *optional*):
@@ -590,7 +593,9 @@ class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam.template("prompt", required=False), # it is not required for qwenimage-layered, unlike other pipelines
InputParam.template(
"prompt", required=False
), # it is not required for qwenimage-layered, unlike other pipelines
InputParam(
name="resized_image",
required=True,
@@ -653,15 +658,15 @@ class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks):
# 3. TEXT ENCODER
# ====================
# auto_docstring
class QwenImageTextEncoderStep(ModularPipelineBlocks):
"""
Text Encoder step that generates text embeddings to guide the image generation.
Components:
text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use
tokenizer (`Qwen2Tokenizer`): The tokenizer to use
guider (`ClassifierFreeGuidance`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
The tokenizer to use guider (`ClassifierFreeGuidance`)
Inputs:
prompt (`str`):
@@ -681,6 +686,7 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks):
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask.
"""
model_name = "qwenimage"
def __init__(self):
@@ -706,7 +712,6 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks):
),
]
@property
def inputs(self) -> List[InputParam]:
return [
@@ -786,12 +791,12 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks):
# auto_docstring
class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
"""
Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation.
Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image
generation.
Components:
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider
(`ClassifierFreeGuidance`)
Inputs:
prompt (`str`):
@@ -811,6 +816,7 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask.
"""
model_name = "qwenimage"
def __init__(self):
@@ -835,7 +841,6 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
),
]
@property
def inputs(self) -> List[InputParam]:
return [
@@ -909,12 +914,12 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
# auto_docstring
class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
"""
Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text embeddings for guiding image generation.
Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text
embeddings for guiding image generation.
Components:
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider
(`ClassifierFreeGuidance`)
Inputs:
prompt (`str`):
@@ -922,7 +927,8 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
negative_prompt (`str`, *optional*):
The prompt or prompts not to guide the image generation.
resized_cond_image (`Tensor`):
The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step
The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using
resize step
Outputs:
prompt_embeds (`Tensor`):
@@ -963,7 +969,6 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
),
]
@property
def inputs(self) -> List[InputParam]:
return [
@@ -1042,10 +1047,12 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks):
# 4. IMAGE PREPROCESS
# ====================
# auto_docstring
class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks):
"""
Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be resized to the given height and width.
Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be
resized to the given height and width.
Components:
image_mask_processor (`InpaintProcessor`)
@@ -1070,6 +1077,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks):
mask_overlay_kwargs (`Dict`):
The kwargs for the postprocess step to apply the mask overlay
"""
model_name = "qwenimage"
@property
@@ -1152,7 +1160,8 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks):
# auto_docstring
class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks):
"""
Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be resized first.
Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be
resized first.
Components:
image_mask_processor (`InpaintProcessor`)
@@ -1173,6 +1182,7 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks):
mask_overlay_kwargs (`Dict`):
The kwargs for the postprocess step to apply the mask overlay
"""
model_name = "qwenimage-edit"
@property
@@ -1206,11 +1216,7 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks):
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(
name="processed_image",
type_hint=torch.Tensor,
description="The processed image"
),
OutputParam(name="processed_image", type_hint=torch.Tensor, description="The processed image"),
OutputParam(
name="processed_mask_image",
type_hint=torch.Tensor,
@@ -1263,6 +1269,7 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
processed_image (`Tensor`):
The processed image
"""
model_name = "qwenimage"
@property
@@ -1290,11 +1297,13 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam(
name="processed_image",
type_hint=torch.Tensor,
description="The processed image",
)]
return [
OutputParam(
name="processed_image",
type_hint=torch.Tensor,
description="The processed image",
)
]
@staticmethod
def check_inputs(height, width, vae_scale_factor):
@@ -1340,6 +1349,7 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks):
processed_image (`Tensor`):
The processed image
"""
model_name = "qwenimage-edit"
@property
@@ -1361,7 +1371,7 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks):
def inputs(self) -> List[InputParam]:
return [
InputParam(
name="resized_image",
name="resized_image",
required=True,
type_hint=List[PIL.Image.Image],
description="The resized image. should be generated using a resize step",
@@ -1370,11 +1380,13 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks):
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam(
name="processed_image",
type_hint=torch.Tensor,
description="The processed image",
)]
return [
OutputParam(
name="processed_image",
type_hint=torch.Tensor,
description="The processed image",
)
]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
@@ -1395,7 +1407,8 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks):
# auto_docstring
class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks):
"""
Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of processed images.
Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of
processed images.
Components:
image_processor (`VaeImageProcessor`)
@@ -1408,6 +1421,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks):
processed_image (`Tensor`):
The processed image
"""
model_name = "qwenimage-edit-plus"
@property
@@ -1427,20 +1441,24 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [InputParam(
name="resized_image",
required=True,
type_hint=List[PIL.Image.Image],
description="The resized image. should be generated using a resize step",
)]
return [
InputParam(
name="resized_image",
required=True,
type_hint=List[PIL.Image.Image],
description="The resized image. should be generated using a resize step",
)
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [OutputParam(
name="processed_image",
type_hint=torch.Tensor,
description="The processed image",
)]
return [
OutputParam(
name="processed_image",
type_hint=torch.Tensor,
description="The processed image",
)
]
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
@@ -1472,6 +1490,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks):
# 5. VAE ENCODER
# ====================
# auto_docstring
class QwenImageVaeEncoderStep(ModularPipelineBlocks):
"""
@@ -1509,7 +1528,9 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks):
output (OutputParam, optional): Output parameter for the image latents. Defaults to "image_latents".
"""
if input is None:
input = InputParam(name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode")
input = InputParam(
name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode"
)
if output is None:
output = OutputParam.template("image_latents")
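For orientation, the `encode_vae_image` helper shown in an earlier hunk typically looks like this (a sketch modeled on the QwenImage pipelines; the normalization names are assumptions):

def encode_vae_image(vae, image, generator):
    # add a frame dim for the 3D VAE: (B, C, H, W) -> (B, C, 1, H, W)
    latents = vae.encode(image.unsqueeze(2)).latent_dist.sample(generator=generator)
    latents_mean = torch.tensor(vae.config.latents_mean).view(1, vae.config.z_dim, 1, 1, 1).to(latents)
    latents_std = torch.tensor(vae.config.latents_std).view(1, vae.config.z_dim, 1, 1, 1).to(latents)
    return (latents - latents_mean) / latents_std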
@@ -1539,13 +1560,13 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
self._input, # default is "processed_image"
self._input, # default is "processed_image"
InputParam.template("generator"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [self._output] # default is "image_latents"
return [self._output] # default is "image_latents"
@torch.no_grad()
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
@@ -1588,9 +1609,8 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks):
VAE Encoder step that converts `control_image` into latent representations control_image_latents.
Components:
vae (`AutoencoderKLQwenImage`)
controlnet (`QwenImageControlNetModel`)
control_image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor
(`VaeImageProcessor`)
Inputs:
control_image (`Image`):
@@ -1606,6 +1626,7 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks):
control_image_latents (`Tensor`):
The latents representing the control image
"""
model_name = "qwenimage"
@property
@@ -1720,6 +1741,7 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks):
# 6. PERMUTE LATENTS
# ====================
# auto_docstring
class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks):
"""
@@ -1733,11 +1755,12 @@ class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks):
image_latents (`Tensor`):
The latent representation of the input image. (permuted from [B, C, 1, H, W] to [B, 1, C, H, W])
"""
model_name = "qwenimage-layered"
@property
def description(self) -> str:
return f"Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing."
return "Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing."
@property
def inputs(self) -> List[InputParam]:
@@ -1760,4 +1783,4 @@ class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks):
block_state.image_latents = latents.permute(0, 2, 1, 3, 4)
self.set_block_state(state, block_state)
return components, state
return components, state

View File

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple, Optional
from typing import List, Optional, Tuple
import torch
@@ -117,7 +117,8 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):
1. Determines `batch_size` and `dtype` based on `prompt_embeds`
2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)
This block should be placed after all encoder steps to process the text embeddings before they are used in subsequent pipeline steps.
This block should be placed after all encoder steps to process the text embeddings before they are used in
subsequent pipeline steps.
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -145,6 +146,7 @@ class QwenImageTextInputsStep(ModularPipelineBlocks):
negative_prompt_embeds_mask (`Tensor`):
The negative prompt embeddings mask. (batch-expanded)
"""
model_name = "qwenimage"
@property
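The two jobs listed in the docstring amount to roughly the following (a sketch mirroring the description, not the block's literal code):

def expand_text_inputs(prompt_embeds, prompt_embeds_mask, num_images_per_prompt):
    batch_size = prompt_embeds.shape[0]  # number of prompts
    dtype = prompt_embeds.dtype
    # expand every text embedding to batch_size * num_images_per_prompt
    prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
    if prompt_embeds_mask is not None:
        prompt_embeds_mask = prompt_embeds_mask.repeat_interleave(num_images_per_prompt, dim=0)
    return prompt_embeds, prompt_embeds_mask, batch_size, dtype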
@@ -271,8 +273,8 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
generated in input step.
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
@@ -300,7 +302,7 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
self,
image_latent_inputs: Optional[List[InputParam]] = None,
additional_batch_inputs: Optional[List[InputParam]] = None,
):
):
# by default, process `image_latents`
if image_latent_inputs is None:
image_latent_inputs = [InputParam.template("image_latents")]
@@ -319,7 +321,9 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
else:
for input_param in additional_batch_inputs:
if not isinstance(input_param, InputParam):
raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}")
raise ValueError(
f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
)
self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
@@ -376,13 +380,17 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks):
name="image_width",
type_hint=int,
description="The image width calculated from the image latents dimension",
)
),
]
# `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided
if len(self._image_latent_inputs) > 0:
outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height"))
outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width"))
outputs.append(
OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
)
outputs.append(
OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
)
# image latent inputs are modified in place (patchified and batch-expanded)
for input_param in self._image_latent_inputs:
@@ -479,8 +487,8 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
generated in input step.
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
@@ -526,7 +534,9 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
else:
for input_param in additional_batch_inputs:
if not isinstance(input_param, InputParam):
raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}")
raise ValueError(
f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
)
self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
@@ -587,11 +597,15 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
description="The image widths calculated from the image latents dimension",
),
]
# `height`/`width` are updated if any image latent inputs are provided
if len(self._image_latent_inputs) > 0:
outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height"))
outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width"))
outputs.append(
OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
)
outputs.append(
OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
)
# image latent inputs are modified in place (patchified, concatenated, and batch-expanded)
for input_param in self._image_latent_inputs:
@@ -686,11 +700,13 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks):
# same as QwenImageAdditionalInputsStep, but with layered pachifier.
# auto_docstring
class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
"""
Input processing step for Layered that:
1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch size
1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch
size
2. For additional batch inputs: Expands batch dimensions to match final batch size
Configured inputs:
@@ -705,8 +721,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
generated in input step.
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
@@ -720,8 +736,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
width (`int`):
if not provided, updated to image width
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered
pachifier and batch-expanded)
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
with layered pachifier and batch-expanded)
"""
model_name = "qwenimage-layered"
@@ -748,7 +764,9 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
else:
for input_param in additional_batch_inputs:
if not isinstance(input_param, InputParam):
raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}")
raise ValueError(
f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}"
)
self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
@@ -808,8 +826,12 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks):
]
if len(self._image_latent_inputs) > 0:
outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height"))
outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width"))
outputs.append(
OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")
)
outputs.append(
OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")
)
# Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded)
for input_param in self._image_latent_inputs:
@@ -895,10 +917,11 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks):
Inputs:
control_image_latents (`Tensor`):
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
step.
batch_size (`int`, *optional*, defaults to 1):
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be
generated in input step.
Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
be generated in input step.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
height (`int`, *optional*):
@@ -914,6 +937,7 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks):
width (`int`):
if not provided, updated to control image width
"""
model_name = "qwenimage"
@property
@@ -923,17 +947,26 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam(name="control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step."),
InputParam(
name="control_image_latents",
required=True,
type_hint=torch.Tensor,
description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.",
),
InputParam.template("batch_size"),
InputParam.template("num_images_per_prompt"),
InputParam.template("height"),
InputParam.template("width"),
]
@property
def intermediate_outputs(self) -> List[OutputParam]:
return [
OutputParam(name="control_image_latents", type_hint=torch.Tensor, description="The control image latents (patchified and batch-expanded)."),
OutputParam(
name="control_image_latents",
type_hint=torch.Tensor,
description="The control image latents (patchified and batch-expanded).",
),
OutputParam(name="height", type_hint=int, description="if not provided, updated to control image height"),
OutputParam(name="width", type_hint=int, description="if not provided, updated to control image width"),
]
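The `InputParam.template(...)` calls above pull shared parameter definitions from a registry; a hedged sketch of the pattern (the import path is an assumption):

# Import path assumed; adjust to your diffusers version.
from diffusers.modular_pipelines.modular_pipeline_utils import InputParam

height = InputParam.template("height")  # reuse the shared definition as-is
batch_size = InputParam.template("batch_size", note="computed in the input step")  # append a note
# Unknown template names raise ValueError.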

View File

@@ -13,9 +13,10 @@
# limitations under the License.
import torch
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam
from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam
from .before_denoise import (
QwenImageControlNetBeforeDenoiserStep,
QwenImageCreateMaskLatentsStep,
@@ -65,9 +66,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks):
Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block.
Components:
text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use
tokenizer (`Qwen2Tokenizer`): The tokenizer to use
guider (`ClassifierFreeGuidance`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
The tokenizer to use guider (`ClassifierFreeGuidance`)
Inputs:
prompt (`str`, *optional*):
@@ -114,8 +114,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
- Creates `image_latents`.
Components:
image_mask_processor (`InpaintProcessor`)
vae (`AutoencoderKLQwenImage`)
image_mask_processor (`InpaintProcessor`) vae (`AutoencoderKLQwenImage`)
Inputs:
mask_image (`Image`):
@@ -162,8 +161,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
Vae encoder step that preprocesses and encodes the image inputs into their latent representations.
Components:
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`)
image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`)
Inputs:
image (`Union[Image, List]`):
@@ -218,9 +216,8 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
- if `control_image` is not provided, the step will be skipped.
Components:
vae (`AutoencoderKLQwenImage`)
controlnet (`QwenImageControlNetModel`)
control_image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor
(`VaeImageProcessor`)
Inputs:
control_image (`Image`, *optional*):
@@ -380,7 +377,9 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks):
block_classes = [
QwenImageTextInputsStep(),
QwenImageAdditionalInputsStep(
additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")]
additional_batch_inputs=[
InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
]
),
]
block_names = ["text_inputs", "additional_inputs"]
@@ -401,15 +400,14 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
- Create the patchified latents `mask` based on the processed mask image.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`)
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`)
Inputs:
latents (`Tensor`):
The initial random noise, can be generated in the prepare latents step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from
vae encoder and updated in input step.)
image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
generated from vae encoder and updated in input step.)
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
processed_mask_image (`Tensor`):
@@ -450,13 +448,12 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
"""
Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing the inputs (timesteps, latents, rope inputs etc.).
Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing
the inputs (timesteps, latents, rope inputs etc.).
Components:
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -524,13 +521,12 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
"""
Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the inpaint task.
Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
inpaint task.
Components:
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -606,13 +602,12 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
"""
Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the img2img task.
Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
img2img task.
Components:
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -686,14 +681,12 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
"""
Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing the inputs (timesteps, latents, rope inputs etc.).
Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as preparing
the inputs (timesteps, latents, rope inputs etc.).
Components:
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
controlnet (`QwenImageControlNetModel`)
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
(`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -707,7 +700,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
negative_prompt_embeds_mask (`Tensor`, *optional*):
mask for the negative text embeddings. Can be generated from text_encoder step.
control_image_latents (`Tensor`):
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
step.
height (`int`, *optional*):
The height in pixels of the generated image.
width (`int`, *optional*):
@@ -773,14 +767,12 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
"""
Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the inpaint task.
Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
inpaint task.
Components:
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
controlnet (`QwenImageControlNetModel`)
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
(`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -802,7 +794,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
processed_mask_image (`Tensor`, *optional*):
The processed mask image
control_image_latents (`Tensor`):
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
step.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
@@ -868,14 +861,12 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
"""
Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the img2img task.
Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
img2img task.
Components:
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
controlnet (`QwenImageControlNetModel`)
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
(`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -895,7 +886,8 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step.
control_image_latents (`Tensor`):
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
step.
latents (`Tensor`, *optional*):
Pre-generated noisy latents for image generation.
generator (`Generator`, *optional*):
@@ -1030,12 +1022,12 @@ class QwenImageDecodeStep(SequentialPipelineBlocks):
Decode step that decodes the latents to images and postprocesses the generated image.
Components:
vae (`AutoencoderKLQwenImage`)
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
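Schematically, a decode step unpacks the patchified latents back to a spatial grid, runs the VAE decoder, and post-processes to the requested `output_type`. A simplified sketch under the same 8x/2x2 assumptions as earlier; the real QwenImage latents carry an extra frame dimension that is elided here:

import torch

def decode_sketch(vae, image_processor, latents, height, width, output_type="pil"):
    # latents: (batch, seq_len, 4 * channels) -> (batch, channels, height/8, width/8)
    b, seq_len, dim = latents.shape
    h, w = height // 16, width // 16  # latent grid before unpatchify (8x VAE, 2x2 patches)
    latents = (
        latents.reshape(b, h, w, dim // 4, 2, 2)
        .permute(0, 3, 1, 4, 2, 5)
        .reshape(b, dim // 4, h * 2, w * 2)
    )
    image = vae.decode(latents).sample  # standard diffusers VAE call
    return image_processor.postprocess(image, output_type=output_type)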
@@ -1057,19 +1049,21 @@ class QwenImageDecodeStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
"""
Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the mask overlay to the original image.
Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the mask
overlay to the original image.
Components:
vae (`AutoencoderKLQwenImage`)
image_mask_processor (`InpaintProcessor`)
vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`Dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. Generated in InpaintProcessImagesInputStep.
The kwargs for the postprocess step to apply the mask overlay. Generated in
InpaintProcessImagesInputStep.
Outputs:
images (`List`):
@@ -1125,17 +1119,11 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
- for text-to-image generation, all you need to provide is `prompt`
Components:
text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use
tokenizer (`Qwen2Tokenizer`): The tokenizer to use
guider (`ClassifierFreeGuidance`)
image_mask_processor (`InpaintProcessor`)
vae (`AutoencoderKLQwenImage`)
image_processor (`VaeImageProcessor`)
controlnet (`QwenImageControlNetModel`)
control_image_processor (`VaeImageProcessor`)
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
transformer (`QwenImageTransformer2DModel`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
The tokenizer to use guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae
(`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) controlnet (`QwenImageControlNetModel`)
control_image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler
(`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
Inputs:
prompt (`str`, *optional*):
@@ -1185,7 +1173,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
strength (`float`, *optional*, defaults to 0.9):
Strength for img2img/inpainting.
control_image_latents (`Tensor`, *optional*):
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.
The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
step.
control_guidance_start (`float`, *optional*, defaults to 0.0):
When to start applying ControlNet.
control_guidance_end (`float`, *optional*, defaults to 1.0):
@@ -1195,7 +1184,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`Dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. Generated in InpaintProcessImagesInputStep.
The kwargs for the postprocess step to apply the mask overlay. Generated in
InpaintProcessImagesInputStep.
Outputs:
images (`List`):
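Tying the auto blocks together, a hedged end-to-end sketch; `init_pipeline`, `load_components`, the `output="images"` call convention, and the repo id are assumptions about the modular-pipelines API and may differ across diffusers versions:

import torch

blocks = QwenImageAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image")  # repo id assumed
pipe.load_components(torch_dtype=torch.bfloat16)
pipe.to("cuda")

# text-to-image: only `prompt` is required
image = pipe(prompt="a red fox in the snow", output="images")[0]
# img2img/inpainting would additionally pass image=, mask_image=, and strength=.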

View File

@@ -13,11 +13,12 @@
# limitations under the License.
from typing import Optional
import torch
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam
from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam
from .before_denoise import (
QwenImageCreateMaskLatentsStep,
QwenImageEditRoPEInputsStep,
@@ -63,10 +64,8 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
QwenImage-Edit VL encoder step that encodes the image and text prompts together.
Components:
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`)
image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
(`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`)
Inputs:
image (`Union[Image, List]`):
@@ -113,9 +112,8 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
Vae encoder step that encodes the image inputs into their latent representations.
Components:
image_resize_processor (`VaeImageProcessor`)
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`)
image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
(`AutoencoderKLQwenImage`)
Inputs:
image (`Union[Image, List]`):
@@ -155,9 +153,8 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
- create image latents.
Components:
image_resize_processor (`VaeImageProcessor`)
image_mask_processor (`InpaintProcessor`)
vae (`AutoencoderKLQwenImage`)
image_resize_processor (`VaeImageProcessor`) image_mask_processor (`InpaintProcessor`) vae
(`AutoencoderKLQwenImage`)
Inputs:
image (`Union[Image, List]`):
@@ -354,7 +351,10 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
model_name = "qwenimage-edit"
block_classes = [
QwenImageTextInputsStep(),
QwenImageAdditionalInputsStep(additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")]
QwenImageAdditionalInputsStep(
additional_batch_inputs=[
InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
]
),
]
block_names = ["text_inputs", "additional_inputs"]
@@ -377,15 +377,14 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
- Create the patchified latents `mask` based on the processed mask image.
Components:
scheduler (`FlowMatchEulerDiscreteScheduler`)
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`)
Inputs:
latents (`Tensor`):
The initial random noise, can be generated in the prepare latents step.
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from
vae encoder and updated in input step.)
image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
generated from vae encoder and updated in input step.)
timesteps (`Tensor`):
The timesteps to use for the denoising process. Can be generated in set_timesteps step.
processed_mask_image (`Tensor`):
@@ -426,10 +425,8 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
Core denoising workflow for QwenImage-Edit edit (img2img) task.
Components:
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -502,10 +499,8 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks):
Core denoising workflow for QwenImage-Edit edit inpaint task.
Components:
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -623,12 +618,12 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks):
Decode step that decodes the latents to images and postprocesses the generated image.
Components:
vae (`AutoencoderKLQwenImage`)
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
@@ -650,19 +645,21 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks):
# auto_docstring
class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks):
"""
Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the mask overlay to the original image.
Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the mask
overlay to the original image.
Components:
vae (`AutoencoderKLQwenImage`)
image_mask_processor (`InpaintProcessor`)
vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`Dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. Generated in InpaintProcessImagesInputStep.
The kwargs for the postprocess step to apply the mask overlay. Generated in
InpaintProcessImagesInputStep.
Outputs:
images (`List`):
@@ -719,19 +716,14 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
"""
Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.
- for edit (img2img) generation, you need to provide `image`
- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop`
- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide
`padding_mask_crop`
Components:
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`)
image_mask_processor (`InpaintProcessor`)
vae (`AutoencoderKLQwenImage`)
image_processor (`VaeImageProcessor`)
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
transformer (`QwenImageTransformer2DModel`)
image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
(`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae
(`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler
(`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
Inputs:
image (`Union[Image, List]`):
@@ -771,7 +763,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
mask_overlay_kwargs (`Dict`, *optional*):
The kwargs for the postprocess step to apply the mask overlay. Generated in InpaintProcessImagesInputStep.
The kwargs for the postprocess step to apply the mask overlay. Generated in
InpaintProcessImagesInputStep.
Outputs:
images (`List`):
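Analogous hedged usage for the edit blocks, under the same API assumptions as the earlier sketch:

import torch
from PIL import Image

blocks = QwenImageEditAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit")  # repo id assumed
pipe.load_components(torch_dtype=torch.bfloat16)

src = Image.open("input.png")
edited = pipe(image=src, prompt="make it nighttime", output="images")[0]  # edit (img2img)
# edit inpaint: additionally pass mask_image= (and optionally padding_mask_crop=)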

View File

@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from ...utils import logging
from ..modular_pipeline import SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam
from ..modular_pipeline_utils import InsertableDict, OutputParam
from .before_denoise import (
QwenImageEditPlusRoPEInputsStep,
QwenImagePrepareLatentsStep,
@@ -55,10 +54,8 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks):
QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together.
Components:
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`)
image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
(`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`)
Inputs:
image (`Union[Image, List]`):
@@ -107,9 +104,8 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks):
Each image is resized independently based on its own aspect ratio to 1024x1024 target area.
Components:
image_resize_processor (`VaeImageProcessor`)
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`)
image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
(`AutoencoderKLQwenImage`)
Inputs:
image (`Union[Image, List]`):
@@ -231,10 +227,8 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
Core denoising workflow for QwenImage-Edit Plus edit (img2img) task.
Components:
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -311,12 +305,12 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
Decode step that decodes the latents to images and postprocesses the generated image.
Components:
vae (`AutoencoderKLQwenImage`)
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
Inputs:
latents (`Tensor`):
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.
The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
step.
output_type (`str`, *optional*, defaults to pil):
Output format: 'pil', 'np', 'pt'.
@@ -357,14 +351,9 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
- VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area.
Components:
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`)
guider (`ClassifierFreeGuidance`)
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`)
pachifier (`QwenImagePachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
(`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_processor (`VaeImageProcessor`) vae
(`AutoencoderKLQwenImage`) pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`)
transformer (`QwenImageTransformer2DModel`)
Inputs:
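Because Edit Plus resizes each reference independently (384x384 target area for the VL encoder, 1024x1024 for the VAE encoder), a list of images is the natural input; a hedged sketch under the same API assumptions, with the repo id assumed:

from PIL import Image

blocks = QwenImageEditPlusAutoBlocks()
pipe = blocks.init_pipeline("Qwen/Qwen-Image-Edit-Plus")  # repo id assumed
pipe.load_components()

refs = [Image.open("subject.png"), Image.open("background.png")]
out = pipe(image=refs, prompt="place the subject in the background", output="images")[0]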

View File

@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from ...utils import logging
from ..modular_pipeline import SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict, OutputParam
@@ -53,14 +52,12 @@ logger = logging.get_logger(__name__)
# auto_docstring
class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
"""
QwenImage-Layered Text encoder step that encodes the text prompt; a prompt will be generated based on the image if not provided.
QwenImage-Layered Text encoder step that encodes the text prompt; a prompt will be generated based on the image if
not provided.
Components:
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`)
tokenizer (`Qwen2Tokenizer`): The tokenizer to use
guider (`ClassifierFreeGuidance`)
image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
(`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`)
Inputs:
image (`Union[Image, List]`):
@@ -116,9 +113,8 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
Vae encoder step that encodes the image inputs into their latent representations.
Components:
image_resize_processor (`VaeImageProcessor`)
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`)
image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
(`AutoencoderKLQwenImage`)
Inputs:
image (`Union[Image, List]`):
@@ -203,8 +199,8 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks):
width (`int`):
if not provided, updated to image width
image_latents (`Tensor`):
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered
pachifier and batch-expanded)
image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
with layered pachifier and batch-expanded)
"""
model_name = "qwenimage-layered"
@@ -230,10 +226,8 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
Core denoising workflow for QwenImage-Layered img2img task.
Components:
pachifier (`QwenImageLayeredPachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
guider (`ClassifierFreeGuidance`)
transformer (`QwenImageTransformer2DModel`)
pachifier (`QwenImageLayeredPachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
(`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
Inputs:
num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -317,16 +311,10 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
Auto Modular pipeline for layered denoising tasks using QwenImage-Layered.
Components:
image_resize_processor (`VaeImageProcessor`)
text_encoder (`Qwen2_5_VLForConditionalGeneration`)
processor (`Qwen2VLProcessor`)
tokenizer (`Qwen2Tokenizer`): The tokenizer to use
guider (`ClassifierFreeGuidance`)
image_processor (`VaeImageProcessor`)
vae (`AutoencoderKLQwenImage`)
pachifier (`QwenImageLayeredPachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`)
transformer (`QwenImageTransformer2DModel`)
image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
(`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`)
image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) pachifier (`QwenImageLayeredPachifier`)
scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
Inputs:
image (`Union[Image, List]`):