From f056af1fbb24b79c6cc5360ea782abacd63c34fd Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Mon, 19 Jan 2026 09:27:40 +0100 Subject: [PATCH] make style --- .../modular_pipeline_utils.py | 18 +- .../qwenimage/before_denoise.py | 133 ++++++++----- .../modular_pipelines/qwenimage/decoders.py | 93 +++++---- .../modular_pipelines/qwenimage/denoise.py | 123 ++++++------ .../modular_pipelines/qwenimage/encoders.py | 177 ++++++++++-------- .../modular_pipelines/qwenimage/inputs.py | 91 ++++++--- .../qwenimage/modular_blocks_qwenimage.py | 136 +++++++------- .../modular_blocks_qwenimage_edit.py | 81 ++++---- .../modular_blocks_qwenimage_edit_plus.py | 37 ++-- .../modular_blocks_qwenimage_layered.py | 40 ++-- 10 files changed, 497 insertions(+), 432 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 6f1010daf2..a57212988e 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -438,7 +438,7 @@ INPUT_PARAM_TEMPLATES = { "description": "Number of layers to extract from the image", }, # common intermediate inputs - "prompt_embeds":{ + "prompt_embeds": { "type_hint": torch.Tensor, "required": True, "description": "text embeddings used to guide the image generation. Can be generated from text_encoder step.", @@ -531,16 +531,16 @@ class InputParam: raise ValueError(f"InputParam template for {template_name} not found") template_kwargs = INPUT_PARAM_TEMPLATES[template_name].copy() - + # Determine the actual param name: # 1. From overrides if provided # 2. From template if present # 3. Fall back to template_name name = overrides.pop("name", template_kwargs.pop("name", template_name)) - + if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" - + template_kwargs.update(overrides) return cls(name=name, **template_kwargs) @@ -564,18 +564,18 @@ class OutputParam: """Get template for name if exists, otherwise raise ValueError.""" if template_name not in OUTPUT_PARAM_TEMPLATES: raise ValueError(f"OutputParam template for {template_name} not found") - + template_kwargs = OUTPUT_PARAM_TEMPLATES[template_name].copy() - + # Determine the actual param name: # 1. From overrides if provided # 2. From template if present # 3. Fall back to template_name name = overrides.pop("name", template_kwargs.pop("name", template_name)) - + if note and "description" in template_kwargs: template_kwargs["description"] = f"{template_kwargs['description']} ({note})" - + template_kwargs.update(overrides) return cls(name=name, **template_kwargs) @@ -913,4 +913,4 @@ def make_doc_string( output += "\n\n" output += format_output_params(outputs, indent_level=2) - return output \ No newline at end of file + return output diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 0b8cd0f4b2..418d927f4f 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -117,6 +117,7 @@ def get_timesteps(scheduler, num_inference_steps, strength): # 1. PREPARE LATENTS # ==================== + # auto_docstring class QwenImagePrepareLatentsStep(ModularPipelineBlocks): """ @@ -137,8 +138,8 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks): generator (`Generator`, *optional*): Torch generator for deterministic generation. 
batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. dtype (`dtype`, *optional*, defaults to torch.float32): The dtype of the model inputs, can be generated in input step. @@ -150,6 +151,7 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks): latents (`Tensor`): The initial latents to use for the denoising process """ + model_name = "qwenimage" @property @@ -254,8 +256,8 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): generator (`Generator`, *optional*): Torch generator for deterministic generation. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. dtype (`dtype`, *optional*, defaults to torch.float32): The dtype of the model inputs, can be generated in input step. @@ -267,6 +269,7 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): latents (`Tensor`): The initial latents to use for the denoising process """ + model_name = "qwenimage-layered" @property @@ -353,7 +356,8 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): # auto_docstring class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): """ - Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, prepare_latents. Both noise and image latents should alreadybe patchified. + Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, + prepare_latents. Both noise and image latents should alreadybe patchified. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -362,8 +366,8 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and updated in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. @@ -373,6 +377,7 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): latents (`Tensor`): The scaled noisy latents to use for inpainting/image-to-image denoising. """ + model_name = "qwenimage" @property @@ -396,10 +401,10 @@ class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks): ), InputParam.template("image_latents", note="Can be generated from vae encoder and updated in input step."), InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. 
Can be generated in set_timesteps step.", ), ] @@ -475,6 +480,7 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks): mask (`Tensor`): The mask to use for the inpainting process. """ + model_name = "qwenimage" @property @@ -541,10 +547,12 @@ class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks): # 2. SET TIMESTEPS # ==================== + # auto_docstring class QwenImageSetTimestepsStep(ModularPipelineBlocks): """ - Step that sets the the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step. + Step that sets the the scheduler's timesteps for text-to-image generation. Should be run after prepare latents + step. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -561,6 +569,7 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks): timesteps (`Tensor`): The timesteps to use for the denoising process """ + model_name = "qwenimage" @property @@ -579,10 +588,10 @@ class QwenImageSetTimestepsStep(ModularPipelineBlocks): InputParam.template("num_inference_steps"), InputParam.template("sigmas"), InputParam( - name="latents", + name="latents", required=True, type_hint=torch.Tensor, - description="The initial random noised latents for the denoising process. Can be generated in prepare latents step." + description="The initial random noised latents for the denoising process. Can be generated in prepare latents step.", ), ] @@ -640,13 +649,14 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): sigmas (`List`, *optional*): Custom sigmas for the denoising process. image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from - vae encoder and packed in input step.) + image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be + generated from vae encoder and packed in input step.) Outputs: timesteps (`Tensor`): The timesteps to use for the denoising process. """ + model_name = "qwenimage-layered" @property @@ -671,9 +681,7 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name="timesteps", - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process." + name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process." ), ] @@ -711,7 +719,8 @@ class QwenImageLayeredSetTimestepsStep(ModularPipelineBlocks): # auto_docstring class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): """ - Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after prepare latents step. + Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after + prepare latents step. Components: scheduler (`FlowMatchEulerDiscreteScheduler`) @@ -732,6 +741,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): num_inference_steps (`int`): The number of denoising steps to perform at inference time. Updated based on strength. """ + model_name = "qwenimage" @property @@ -750,10 +760,10 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): InputParam.template("num_inference_steps"), InputParam.template("sigmas"), InputParam( - "latents", - required=True, + "latents", + required=True, type_hint=torch.Tensor, - description="The latents to use for the denoising process. Can be generated in prepare latents step." 
+ description="The latents to use for the denoising process. Can be generated in prepare latents step.", ), InputParam.template("strength", default=0.9), ] @@ -815,6 +825,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): ## RoPE inputs for denoiser + # auto_docstring class QwenImageRoPEInputsStep(ModularPipelineBlocks): """ @@ -822,8 +833,8 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. height (`int`): The height in pixels of the generated image. width (`int`): @@ -841,6 +852,7 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): negative_txt_seq_lens (`List`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ + model_name = "qwenimage" @property @@ -911,12 +923,13 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks): # auto_docstring class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): """ - Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after prepare_latents step + Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after + prepare_latents step Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. image_height (`int`): The height of the reference image. Can be generated in input step. image_width (`int`): @@ -938,6 +951,7 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): negative_txt_seq_lens (`List`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ + model_name = "qwenimage" @property @@ -948,8 +962,18 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam(name="image_height", required=True, type_hint=int, description="The height of the reference image. Can be generated in input step."), - InputParam(name="image_width", required=True, type_hint=int, description="The width of the reference image. Can be generated in input step."), + InputParam( + name="image_height", + required=True, + type_hint=int, + description="The height of the reference image. Can be generated in input step.", + ), + InputParam( + name="image_width", + required=True, + type_hint=int, + description="The width of the reference image. Can be generated in input step.", + ), InputParam.template("height", required=True), InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), @@ -1016,13 +1040,13 @@ class QwenImageEditRoPEInputsStep(ModularPipelineBlocks): class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): """ Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus. - Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. - Should be placed after prepare_latents step. + Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. 
Should be placed + after prepare_latents step. Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. image_height (`List`): The heights of the reference images. Can be generated in input step. image_width (`List`): @@ -1044,6 +1068,7 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): negative_txt_seq_lens (`List`): The sequence lengths of the negative prompt embeds, used for RoPE calculation """ + model_name = "qwenimage-edit-plus" @property @@ -1058,8 +1083,18 @@ class QwenImageEditPlusRoPEInputsStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam.template("batch_size"), - InputParam(name="image_height", required=True, type_hint=List[int], description="The heights of the reference images. Can be generated in input step."), - InputParam(name="image_width", required=True, type_hint=List[int], description="The widths of the reference images. Can be generated in input step."), + InputParam( + name="image_height", + required=True, + type_hint=List[int], + description="The heights of the reference images. Can be generated in input step.", + ), + InputParam( + name="image_width", + required=True, + type_hint=List[int], + description="The widths of the reference images. Can be generated in input step.", + ), InputParam.template("height", required=True), InputParam.template("width", required=True), InputParam.template("prompt_embeds_mask"), @@ -1126,8 +1161,8 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): Inputs: batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. layers (`int`, *optional*, defaults to 4): Number of layers to extract from the image height (`int`): @@ -1149,6 +1184,7 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): additional_t_cond (`Tensor`): The additional t cond, used for RoPE calculation """ + model_name = "qwenimage-layered" @property @@ -1231,6 +1267,7 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): ## ControlNet inputs for denoiser + # auto_docstring class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): """ @@ -1247,7 +1284,8 @@ class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks): controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0): Scale for ControlNet conditioning. control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. 
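For reference, the `controlnet_keep` output documented below follows the convention used
across diffusers ControlNet pipelines: one conditioning-scale multiplier per denoising
step, zeroed outside the `control_guidance_start`/`control_guidance_end` window. A
minimal sketch of that convention (names taken from the docstring; the block's actual
`__call__` body is not part of this hunk):

    # One keep value per timestep; the ControlNet is only active while the
    # current fraction of the schedule lies inside [start, end].
    controlnet_keep = []
    for i in range(len(timesteps)):
        keep = 1.0 - float(
            i / len(timesteps) < control_guidance_start
            or (i + 1) / len(timesteps) > control_guidance_end
        )
        controlnet_keep.append(keep)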
@@ -1255,6 +1293,7 @@
     Outputs:
         controlnet_keep (`List`):
             The controlnet keep values
     """
+
     model_name = "qwenimage"

     @property
@@ -1274,16 +1313,16 @@
             InputParam.template("control_guidance_end"),
             InputParam.template("controlnet_conditioning_scale"),
             InputParam(
-                name="control_image_latents",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step."
+                name="control_image_latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.",
             ),
             InputParam(
-                name="timesteps",
-                required=True,
-                type_hint=torch.Tensor,
-                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step."
+                name="timesteps",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
             ),
         ]

diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py
index 650bf34da7..1adbf6bdd3 100644
--- a/src/diffusers/modular_pipelines/qwenimage/decoders.py
+++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py
@@ -30,10 +30,12 @@ logger = logging.get_logger(__name__)


 # after denoising loop (unpack latents)
-#auto_docstring
+
+# auto_docstring
 class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
     """
-    Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, channels, 1, height, width)
+    Step that unpacks the latents from a 3D tensor (batch_size, sequence_length, channels) into a 5D tensor
+    (batch_size, channels, 1, height, width)

     Components:
         pachifier (`QwenImagePachifier`)
@@ -50,6 +52,7 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
         latents (`Tensor`):
             The denoised latents unpacked to B, C, 1, H, W
     """
+
     model_name = "qwenimage"

     @property
@@ -70,10 +73,10 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
         InputParam.template("height", required=True),
         InputParam.template("width", required=True),
         InputParam(
-            name="latents",
-            required=True,
-            type_hint=torch.Tensor,
-            description="The latents to decode, can be generated in the denoise step."
+            name="latents",
+            required=True,
+            type_hint=torch.Tensor,
+            description="The latents to decode, can be generated in the denoise step.",
         ),
     ]

@@ -81,9 +84,7 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
             OutputParam(
-                name="latents",
-                type_hint=torch.Tensor,
-                description="The denoisedlatents unpacked to B, C, 1, H, W"
+                name="latents", type_hint=torch.Tensor, description="The denoised latents unpacked to B, C, 1, H, W"
             ),
         ]

@@ -100,7 +101,7 @@ class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
         return components, state


-#auto_docstring
+# auto_docstring
 class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
     """
     Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising.
@@ -122,6 +123,7 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks):
         latents (`Tensor`):
             Denoised latents.
(unpacked to B, C, layers+1, H, W) """ + model_name = "qwenimage-layered" @property @@ -138,10 +140,10 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The denoised latents to decode, can be generated in the denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step.", ), InputParam.template("height", required=True), InputParam.template("width", required=True), @@ -173,7 +175,8 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): # decode step -#auto_docstring + +# auto_docstring class QwenImageDecoderStep(ModularPipelineBlocks): """ Step that decodes the latents to images @@ -183,12 +186,14 @@ class QwenImageDecoderStep(ModularPipelineBlocks): Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. Outputs: images (`List`): Generated images. (tensor output of the vae decoder.) """ + model_name = "qwenimage" @property @@ -207,10 +212,10 @@ class QwenImageDecoderStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.", ), ] @@ -246,18 +251,18 @@ class QwenImageDecoderStep(ModularPipelineBlocks): return components, state -#auto_docstring +# auto_docstring class QwenImageLayeredDecoderStep(ModularPipelineBlocks): """ Decode unpacked latents (B, C, layers+1, H, W) into layer images. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -265,6 +270,7 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage-layered" @property @@ -287,10 +293,10 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step." 
+ name="latents", + required=True, + type_hint=torch.Tensor, + description="The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.", ), InputParam.template("output_type"), ] @@ -345,7 +351,8 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks): # postprocess the decoded images -#auto_docstring + +# auto_docstring class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): """ postprocess the generated image @@ -363,6 +370,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): images (`List`): Generated images. """ + model_name = "qwenimage" @property @@ -384,10 +392,10 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam( - name="images", - required=True, - type_hint=torch.Tensor, - description="the generated image tensor from decoders step" + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step", ), InputParam.template("output_type"), ] @@ -416,7 +424,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): return components, state -#auto_docstring +# auto_docstring class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): """ postprocess the generated image, optional apply the mask overally to the original image.. @@ -430,12 +438,14 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): Generated images. """ + model_name = "qwenimage" @property @@ -457,16 +467,17 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam( - name="images", - required=True, - type_hint=torch.Tensor, - description="the generated image tensor from decoders step" + name="images", + required=True, + type_hint=torch.Tensor, + description="the generated image tensor from decoders step", ), InputParam.template("output_type"), InputParam( - name="mask_overlay_kwargs", + name="mask_overlay_kwargs", type_hint=Dict[str, Any], - description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep."), + description="The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep.", + ), ] @property diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index ff6e411d76..3b00fcb274 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -50,10 +50,10 @@ class QwenImageLoopBeforeDenoiser(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. 
Can be generated in prepare_latent step.", ), ] @@ -80,10 +80,10 @@ class QwenImageEditLoopBeforeDenoiser(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam( - name="latents", - required=True, - type_hint=torch.Tensor, - description="The initial latents to use for the denoising process. Can be generated in prepare_latent step." + name="latents", + required=True, + type_hint=torch.Tensor, + description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.", ), InputParam.template("image_latents"), ] @@ -131,10 +131,10 @@ class QwenImageLoopBeforeDenoiserControlNet(ModularPipelineBlocks): ), InputParam.template("controlnet_conditioning_scale", note="updated in prepare_controlnet_inputs step."), InputParam( - name="controlnet_keep", - required=True, - type_hint=List[float], - description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step." + name="controlnet_keep", + required=True, + type_hint=List[float], + description="The controlnet keep values. Can be generated in prepare_controlnet_inputs step.", ), ] @@ -467,10 +467,10 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks): def loop_inputs(self) -> List[InputParam]: return [ InputParam( - name="timesteps", - required=True, - type_hint=torch.Tensor, - description="The timesteps to use for the denoising process. Can be generated in set_timesteps step." + name="timesteps", + required=True, + type_hint=torch.Tensor, + description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.", ), InputParam.template("num_inference_steps", required=True), ] @@ -505,21 +505,21 @@ class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks): # Qwen Image (text2image, image2image) + # auto_docstring class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopDenoiser` - `QwenImageLoopAfterDenoiser` This block supports text2image and image2image tasks for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -539,6 +539,7 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ @@ -551,8 +552,8 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): @property def description(self) -> str: return ( - "Denoise step that iteratively denoise the latents. 
\n" - "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method \n" + "Denoise step that iteratively denoise the latents.\n" + "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method\n" "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n" " - `QwenImageLoopBeforeDenoiser`\n" " - `QwenImageLoopDenoiser`\n" @@ -565,9 +566,9 @@ class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper): # auto_docstring class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopDenoiser` - `QwenImageLoopAfterDenoiser` @@ -575,9 +576,8 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports inpainting tasks for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -603,6 +603,7 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -630,9 +631,9 @@ class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): # auto_docstring class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopBeforeDenoiserControlNet` - `QwenImageLoopDenoiser` @@ -640,10 +641,8 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports text2img/img2img tasks with controlnet for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - controlnet (`QwenImageControlNetModel`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer + (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -669,6 +668,7 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -696,9 +696,9 @@ class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): # auto_docstring class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. 
+ Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageLoopBeforeDenoiser` - `QwenImageLoopBeforeDenoiserControlNet` - `QwenImageLoopDenoiser` @@ -707,10 +707,8 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports inpainting tasks with controlnet for QwenImage. Components: - guider (`ClassifierFreeGuidance`) - controlnet (`QwenImageControlNetModel`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) controlnet (`QwenImageControlNetModel`) transformer + (`QwenImageTransformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -742,6 +740,7 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage" block_classes = [ QwenImageLoopBeforeDenoiser, @@ -777,18 +776,17 @@ class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper): # auto_docstring class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageEditLoopBeforeDenoiser` - `QwenImageEditLoopDenoiser` - `QwenImageLoopAfterDenoiser` This block supports QwenImage Edit. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -810,6 +808,7 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -835,9 +834,9 @@ class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper): # auto_docstring class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageEditLoopBeforeDenoiser` - `QwenImageEditLoopDenoiser` - `QwenImageLoopAfterDenoiser` @@ -845,9 +844,8 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): This block supports inpainting tasks for QwenImage Edit. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -873,6 +871,7 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. 
""" + model_name = "qwenimage-edit" block_classes = [ QwenImageEditLoopBeforeDenoiser, @@ -900,18 +899,17 @@ class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper): # auto_docstring class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): """ - Denoise step that iteratively denoise the latents. - Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method - At each iteration, it runs blocks defined in `sub_blocks` sequencially: + Denoise step that iteratively denoise the latents. + Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method At each iteration, it runs blocks + defined in `sub_blocks` sequencially: - `QwenImageEditLoopBeforeDenoiser` - `QwenImageEditLoopDenoiser` - `QwenImageLoopAfterDenoiser` This block supports QwenImage Layered. Components: - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) - scheduler (`FlowMatchEulerDiscreteScheduler`) + guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) scheduler + (`FlowMatchEulerDiscreteScheduler`) Inputs: timesteps (`Tensor`): @@ -933,6 +931,7 @@ class QwenImageLayeredDenoiseStep(QwenImageDenoiseLoopWrapper): latents (`Tensor`): Denoised latents. """ + model_name = "qwenimage-layered" block_classes = [ QwenImageEditLoopBeforeDenoiser, diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 083ee507cc..5e1821cca5 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -30,7 +30,7 @@ from ...pipelines.qwenimage.pipeline_qwenimage_edit import calculate_dimensions from ...utils import logging from ...utils.torch_utils import unwrap_module from ..modular_pipeline import ModularPipelineBlocks, PipelineState -from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam from .modular_pipeline import QwenImageModularPipeline from .prompt_templates import ( QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE, @@ -277,6 +277,7 @@ def encode_vae_image( # In most of our other pipelines, resizing is done as part of the image preprocessing step. # ==================== + # auto_docstring class QwenImageEditResizeStep(ModularPipelineBlocks): """ @@ -293,8 +294,8 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): resized_image (`List`): The resized images """ - model_name = "qwenimage-edit" + model_name = "qwenimage-edit" @property def description(self) -> str: @@ -319,8 +320,8 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): def intermediate_outputs(self) -> List[OutputParam]: return [ OutputParam( - name="resized_image", - type_hint=List[PIL.Image.Image], + name="resized_image", + type_hint=List[PIL.Image.Image], description="The resized images", ), ] @@ -353,7 +354,8 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): # auto_docstring class QwenImageLayeredResizeStep(ModularPipelineBlocks): """ - Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio. + Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while + maintaining the aspect ratio. 
Components: image_resize_processor (`VaeImageProcessor`) @@ -368,11 +370,12 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): resized_image (`List`): The resized images """ + model_name = "qwenimage-layered" @property def description(self) -> str: - return f"Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio." + return "Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio." @property def expected_components(self) -> List[ComponentSpec]: @@ -399,11 +402,13 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="resized_image", - type_hint=List[PIL.Image.Image], - description="The resized images", - )] + return [ + OutputParam( + name="resized_image", + type_hint=List[PIL.Image.Image], + description="The resized images", + ) + ] @staticmethod def check_inputs(resolution: int): @@ -442,8 +447,8 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): class QwenImageEditPlusResizeStep(ModularPipelineBlocks): """ Resize images for QwenImage Edit Plus pipeline. - Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text encoding. - Each image is resized independently based on its own aspect ratio. + Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text + encoding. Each image is resized independently based on its own aspect ratio. Components: image_resize_processor (`VaeImageProcessor`) @@ -484,7 +489,7 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: # image - return [InputParam.template("image")] + return [InputParam.template("image")] @property def intermediate_outputs(self) -> List[OutputParam]: @@ -518,13 +523,11 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks): resized_cond_images = [] for image in images: image_width, image_height = image.size - + # For VAE encoder (1024x1024 target area) vae_width, vae_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height) - resized_images.append( - components.image_resize_processor.resize(image, height=vae_height, width=vae_width) - ) - + resized_images.append(components.image_resize_processor.resize(image, height=vae_height, width=vae_width)) + # For VL text encoder (384x384 target area) vl_width, vl_height, _ = calculate_dimensions(384 * 384, image_width / image_height) resized_cond_images.append( @@ -541,16 +544,16 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks): # 2. GET IMAGE PROMPT # ==================== + # auto_docstring class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): """ Auto-caption step that generates a text prompt from the input image if none is provided. - Uses the VL model (text_encoder) to generate a description of the image. - If prompt is already provided, this step passes through unchanged. + Uses the VL model (text_encoder) to generate a description of the image. If prompt is already provided, this step + passes through unchanged. 
Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) Inputs: prompt (`str`, *optional*): @@ -590,7 +593,9 @@ class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam.template("prompt", required=False), # it is not required for qwenimage-layered, unlike other pipelines + InputParam.template( + "prompt", required=False + ), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -653,15 +658,15 @@ class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): # 3. TEXT ENCODER # ==================== + # auto_docstring class QwenImageTextEncoderStep(ModularPipelineBlocks): """ Text Encoder step that generates text embeddings to guide the image generation. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use - tokenizer (`Qwen2Tokenizer`): The tokenizer to use - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`): + The tokenizer to use guider (`ClassifierFreeGuidance`) Inputs: prompt (`str`): @@ -681,6 +686,7 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks): negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. """ + model_name = "qwenimage" def __init__(self): @@ -706,7 +712,6 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks): ), ] - @property def inputs(self) -> List[InputParam]: return [ @@ -786,12 +791,12 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks): # auto_docstring class QwenImageEditTextEncoderStep(ModularPipelineBlocks): """ - Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation. + Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image + generation. Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider + (`ClassifierFreeGuidance`) Inputs: prompt (`str`): @@ -811,6 +816,7 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks): negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. """ + model_name = "qwenimage" def __init__(self): @@ -835,7 +841,6 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks): ), ] - @property def inputs(self) -> List[InputParam]: return [ @@ -909,12 +914,12 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks): # auto_docstring class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): """ - Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text embeddings for guiding image generation. + Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text + embeddings for guiding image generation. 
Components: - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider + (`ClassifierFreeGuidance`) Inputs: prompt (`str`): @@ -922,7 +927,8 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): negative_prompt (`str`, *optional*): The prompt or prompts not to guide the image generation. resized_cond_image (`Tensor`): - The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize step + The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using + resize step Outputs: prompt_embeds (`Tensor`): @@ -963,7 +969,6 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): ), ] - @property def inputs(self) -> List[InputParam]: return [ @@ -1042,10 +1047,12 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): # 4. IMAGE PREPROCESS # ==================== + # auto_docstring class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): """ - Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be resized to the given height and width. + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be + resized to the given height and width. Components: image_mask_processor (`InpaintProcessor`) @@ -1070,6 +1077,7 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay """ + model_name = "qwenimage" @property @@ -1152,7 +1160,8 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): # auto_docstring class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): """ - Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be resized first. + Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be + resized first. 
Components: image_mask_processor (`InpaintProcessor`) @@ -1173,6 +1182,7 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay """ + model_name = "qwenimage-edit" @property @@ -1206,11 +1216,7 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image" - ), + OutputParam(name="processed_image", type_hint=torch.Tensor, description="The processed image"), OutputParam( name="processed_mask_image", type_hint=torch.Tensor, @@ -1263,6 +1269,7 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks): processed_image (`Tensor`): The processed image """ + model_name = "qwenimage" @property @@ -1290,11 +1297,13 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks): @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image", - )] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @staticmethod def check_inputs(height, width, vae_scale_factor): @@ -1340,6 +1349,7 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): processed_image (`Tensor`): The processed image """ + model_name = "qwenimage-edit" @property @@ -1361,7 +1371,7 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam( - name="resized_image", + name="resized_image", required=True, type_hint=List[PIL.Image.Image], description="The resized image. should be generated using a resize step", @@ -1370,11 +1380,13 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image", - )] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1395,7 +1407,8 @@ class QwenImageEditProcessImagesInputStep(ModularPipelineBlocks): # auto_docstring class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): """ - Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of processed images. + Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of + processed images. Components: image_processor (`VaeImageProcessor`) @@ -1408,6 +1421,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): processed_image (`Tensor`): The processed image """ + model_name = "qwenimage-edit-plus" @property @@ -1427,20 +1441,24 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: - return [InputParam( - name="resized_image", - required=True, - type_hint=List[PIL.Image.Image], - description="The resized image. should be generated using a resize step", - )] + return [ + InputParam( + name="resized_image", + required=True, + type_hint=List[PIL.Image.Image], + description="The resized image. 
should be generated using a resize step", + ) + ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [OutputParam( - name="processed_image", - type_hint=torch.Tensor, - description="The processed image", - )] + return [ + OutputParam( + name="processed_image", + type_hint=torch.Tensor, + description="The processed image", + ) + ] @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState): @@ -1472,6 +1490,7 @@ class QwenImageEditPlusProcessImagesInputStep(ModularPipelineBlocks): # 5. VAE ENCODER # ==================== + # auto_docstring class QwenImageVaeEncoderStep(ModularPipelineBlocks): """ @@ -1509,7 +1528,9 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks): output (OutputParam, optional): Output parameter for the image latents. Defaults to "image_latents". """ if input is None: - input = InputParam(name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode") + input = InputParam( + name="processed_image", required=True, type_hint=torch.Tensor, description="The image tensor to encode" + ) if output is None: output = OutputParam.template("image_latents") @@ -1539,13 +1560,13 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - self._input, # default is "processed_image" + self._input, # default is "processed_image" InputParam.template("generator"), ] @property def intermediate_outputs(self) -> List[OutputParam]: - return [self._output] # default is "image_latents" + return [self._output] # default is "image_latents" @torch.no_grad() def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState: @@ -1588,9 +1609,8 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): VAE Encoder step that converts `control_image` into latent representations control_image_latents. Components: - vae (`AutoencoderKLQwenImage`) - controlnet (`QwenImageControlNetModel`) - control_image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor + (`VaeImageProcessor`) Inputs: control_image (`Image`): @@ -1606,6 +1626,7 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): control_image_latents (`Tensor`): The latents representing the control image """ + model_name = "qwenimage" @property @@ -1720,6 +1741,7 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): # 6. PERMUTE LATENTS # ==================== + # auto_docstring class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): """ @@ -1733,11 +1755,12 @@ class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): image_latents (`Tensor`): The latent representation of the input image. (permuted from [B, C, 1, H, W] to [B, 1, C, H, W]) """ + model_name = "qwenimage-layered" @property def description(self) -> str: - return f"Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." + return "Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing." 
@property def inputs(self) -> List[InputParam]: @@ -1760,4 +1783,4 @@ class QwenImageLayeredPermuteLatentsStep(ModularPipelineBlocks): block_state.image_latents = latents.permute(0, 2, 1, 3, 4) self.set_block_state(state, block_state) - return components, state \ No newline at end of file + return components, state diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 0e03242e5e..818bbca5ed 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple import torch @@ -117,7 +117,8 @@ class QwenImageTextInputsStep(ModularPipelineBlocks): 1. Determines `batch_size` and `dtype` based on `prompt_embeds` 2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt) - This block should be placed after all encoder steps to process the text embeddings before they are used in subsequent pipeline steps. + This block should be placed after all encoder steps to process the text embeddings before they are used in + subsequent pipeline steps. Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -145,6 +146,7 @@ class QwenImageTextInputsStep(ModularPipelineBlocks): negative_prompt_embeds_mask (`Tensor`): The negative prompt embeddings mask. (batch-expanded) """ + model_name = "qwenimage" @property @@ -271,8 +273,8 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. height (`int`, *optional*): The height in pixels of the generated image. 
width (`int`, *optional*): @@ -300,7 +302,7 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): self, image_latent_inputs: Optional[List[InputParam]] = None, additional_batch_inputs: Optional[List[InputParam]] = None, - ): + ): # by default, process `image_latents` if image_latent_inputs is None: image_latent_inputs = [InputParam.template("image_latents")] @@ -319,7 +321,9 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): else: for input_param in additional_batch_inputs: if not isinstance(input_param, InputParam): - raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -376,13 +380,17 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): name="image_width", type_hint=int, description="The image width calculated from the image latents dimension", - ) + ), ] # `height`/`width` are not new outputs, but they will be updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) - outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) # image latent inputs are modified in place (patchified and batch-expanded) for input_param in self._image_latent_inputs: @@ -479,8 +487,8 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. height (`int`, *optional*): The height in pixels of the generated image. 
width (`int`, *optional*): @@ -526,7 +534,9 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): else: for input_param in additional_batch_inputs: if not isinstance(input_param, InputParam): - raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -587,11 +597,15 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): description="The image widths calculated from the image latents dimension", ), ] - + # `height`/`width` are updated if any image latent inputs are provided if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) - outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) # image latent inputs are modified in place (patchified, concatenated, and batch-expanded) for input_param in self._image_latent_inputs: @@ -686,11 +700,13 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): # same as QwenImageAdditionalInputsStep, but with layered pachifier. + # auto_docstring class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): """ Input processing step for Layered that: - 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch size + 1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch + size 2. For additional batch inputs: Expands batch dimensions to match final batch size Configured inputs: @@ -705,8 +721,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. image_latents (`Tensor`): image latents used to guide the image generation. Can be generated from vae_encoder step. @@ -720,8 +736,8 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): width (`int`): if not provided, updated to image width image_latents (`Tensor`): - image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered - pachifier and batch-expanded) + image latents used to guide the image generation. Can be generated from vae_encoder step. 
(patchified + with layered pachifier and batch-expanded) """ model_name = "qwenimage-layered" @@ -748,7 +764,9 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): else: for input_param in additional_batch_inputs: if not isinstance(input_param, InputParam): - raise ValueError(f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}") + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) self._image_latent_inputs = image_latent_inputs self._additional_batch_inputs = additional_batch_inputs @@ -808,8 +826,12 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): ] if len(self._image_latent_inputs) > 0: - outputs.append(OutputParam(name="height", type_hint=int, description="if not provided, updated to image height")) - outputs.append(OutputParam(name="width", type_hint=int, description="if not provided, updated to image width")) + outputs.append( + OutputParam(name="height", type_hint=int, description="if not provided, updated to image height") + ) + outputs.append( + OutputParam(name="width", type_hint=int, description="if not provided, updated to image width") + ) # Add outputs for image latent inputs (patchified with layered pachifier and batch-expanded) for input_param in self._image_latent_inputs: @@ -895,10 +917,11 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks): Inputs: control_image_latents (`Tensor`): - The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step. + The control image latents to use for the denoising process. Can be generated in controlnet vae encoder + step. batch_size (`int`, *optional*, defaults to 1): - Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be - generated in input step. + Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can + be generated in input step. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. height (`int`, *optional*): @@ -914,6 +937,7 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks): width (`int`): if not provided, updated to control image width """ + model_name = "qwenimage" @property @@ -923,17 +947,26 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="control_image_latents", required=True, type_hint=torch.Tensor, description="The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step."), + InputParam( + name="control_image_latents", + required=True, + type_hint=torch.Tensor, + description="The control image latents to use for the denoising process. 
Can be generated in controlnet vae encoder step.",
+            ),
             InputParam.template("batch_size"),
             InputParam.template("num_images_per_prompt"),
             InputParam.template("height"),
             InputParam.template("width"),
         ]
-    
+
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
-            OutputParam(name="control_image_latents", type_hint=torch.Tensor, description="The control image latents (patchified and batch-expanded)."),
+            OutputParam(
+                name="control_image_latents",
+                type_hint=torch.Tensor,
+                description="The control image latents (patchified and batch-expanded).",
+            ),
             OutputParam(name="height", type_hint=int, description="if not provided, updated to control image height"),
             OutputParam(name="width", type_hint=int, description="if not provided, updated to control image width"),
         ]
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
index b50e41bb50..5837799d34 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py
@@ -13,9 +13,10 @@
 # limitations under the License.
 
 import torch
+
 from ...utils import logging
 from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
-from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam
+from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam
 from .before_denoise import (
     QwenImageControlNetBeforeDenoiserStep,
     QwenImageCreateMaskLatentsStep,
@@ -65,9 +66,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks):
     Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block.
 
     Components:
-        text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use
-        tokenizer (`Qwen2Tokenizer`): The tokenizer to use
-        guider (`ClassifierFreeGuidance`)
+        text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
+        The tokenizer to use guider (`ClassifierFreeGuidance`)
 
     Inputs:
         prompt (`str`, *optional*):
@@ -114,8 +114,7 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
     - Creates `image_latents`.
 
     Components:
-        image_mask_processor (`InpaintProcessor`)
-        vae (`AutoencoderKLQwenImage`)
+        image_mask_processor (`InpaintProcessor`) vae (`AutoencoderKLQwenImage`)
 
     Inputs:
         mask_image (`Image`):
@@ -162,8 +161,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
     Vae encoder step that preprocesses and encodes the image inputs into their latent representations.
 
     Components:
-        image_processor (`VaeImageProcessor`)
-        vae (`AutoencoderKLQwenImage`)
+        image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`)
 
     Inputs:
         image (`Union[Image, List]`):
@@ -218,9 +216,8 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
     - if `control_image` is not provided, step will be skipped.
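The skip behavior described above comes from the auto-block trigger mechanism; a hedged sketch of how such an optional step can be declared (the wrapper class name is hypothetical, and `block_trigger_inputs` is assumed to be the trigger attribute used by the modular framework):

    # Hypothetical optional wrapper (import paths assumed to mirror this PR's
    # modules): the sub-block runs only when its trigger input
    # (`control_image`) is present in the pipeline state; otherwise the whole
    # step is skipped.
    from diffusers.modular_pipelines import AutoPipelineBlocks
    from diffusers.modular_pipelines.qwenimage.encoders import QwenImageControlNetVaeEncoderStep

    class OptionalControlNetVaeEncoder(AutoPipelineBlocks):
        block_classes = [QwenImageControlNetVaeEncoderStep]
        block_names = ["controlnet_vae_encoder"]
        block_trigger_inputs = ["control_image"]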
     Components:
-        vae (`AutoencoderKLQwenImage`)
-        controlnet (`QwenImageControlNetModel`)
-        control_image_processor (`VaeImageProcessor`)
+        vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor
+        (`VaeImageProcessor`)
 
     Inputs:
         control_image (`Image`, *optional*):
@@ -380,7 +377,9 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks):
     block_classes = [
         QwenImageTextInputsStep(),
         QwenImageAdditionalInputsStep(
-            additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")]
+            additional_batch_inputs=[
+                InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
+            ]
         ),
     ]
     block_names = ["text_inputs", "additional_inputs"]
@@ -401,15 +400,14 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
     - Create the patchified latents `mask` based on the processed mask image.
 
     Components:
-        scheduler (`FlowMatchEulerDiscreteScheduler`)
-        pachifier (`QwenImagePachifier`)
+        scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`)
 
     Inputs:
         latents (`Tensor`):
             The initial random noise. Can be generated in the prepare latents step.
         image_latents (`Tensor`):
-            image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from
-            vae encoder and updated in input step.)
+            image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
+            generated from vae encoder and updated in input step.)
         timesteps (`Tensor`):
             The timesteps to use for the denoising process. Can be generated in set_timesteps step.
         processed_mask_image (`Tensor`):
@@ -450,13 +448,12 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.).
+    Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as
+    preparing the inputs (timesteps, latents, rope inputs etc.).
 
     Components:
-        pachifier (`QwenImagePachifier`)
-        scheduler (`FlowMatchEulerDiscreteScheduler`)
-        guider (`ClassifierFreeGuidance`)
-        transformer (`QwenImageTransformer2DModel`)
+        pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
 
     Inputs:
         num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -524,13 +521,12 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task.
+    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
+    inpaint task.
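Strength-based preparation like the inpaint prepare-latents step above boils down to truncating the timestep schedule and noising the clean image latents to the first kept timestep; an illustrative sketch, not the block's exact code (the latent shape is invented):

    import torch
    from diffusers import FlowMatchEulerDiscreteScheduler

    scheduler = FlowMatchEulerDiscreteScheduler()
    num_inference_steps, strength = 50, 0.9
    scheduler.set_timesteps(num_inference_steps)

    # keep only the last `num_inference_steps * strength` timesteps
    t_start = max(num_inference_steps - int(num_inference_steps * strength), 0)
    timesteps = scheduler.timesteps[t_start:]

    image_latents = torch.randn(1, 1024, 64)  # patchified image latents (invented shape)
    noise = torch.randn_like(image_latents)
    # noise the clean latents to the first kept timestep
    latents = scheduler.scale_noise(image_latents, timesteps[:1], noise)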
     Components:
-        pachifier (`QwenImagePachifier`)
-        scheduler (`FlowMatchEulerDiscreteScheduler`)
-        guider (`ClassifierFreeGuidance`)
-        transformer (`QwenImageTransformer2DModel`)
+        pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
 
     Inputs:
         num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -606,13 +602,12 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task.
+    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
+    img2img task.
 
     Components:
-        pachifier (`QwenImagePachifier`)
-        scheduler (`FlowMatchEulerDiscreteScheduler`)
-        guider (`ClassifierFreeGuidance`)
-        transformer (`QwenImageTransformer2DModel`)
+        pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
 
     Inputs:
         num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -686,14 +681,12 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.).
+    Step that denoises noise into an image for the text2image task. It includes the denoise loop, as well as
+    preparing the inputs (timesteps, latents, rope inputs etc.).
 
     Components:
-        pachifier (`QwenImagePachifier`)
-        scheduler (`FlowMatchEulerDiscreteScheduler`)
-        controlnet (`QwenImageControlNetModel`)
-        guider (`ClassifierFreeGuidance`)
-        transformer (`QwenImageTransformer2DModel`)
+        pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
+        (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
 
     Inputs:
         num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -707,7 +700,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks):
         negative_prompt_embeds_mask (`Tensor`, *optional*):
             mask for the negative text embeddings. Can be generated from text_encoder step.
         control_image_latents (`Tensor`):
-            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.
+            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+            step.
         height (`int`, *optional*):
             The height in pixels of the generated image.
         width (`int`, *optional*):
@@ -773,14 +767,12 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task.
+    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
+    inpaint task.
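The ControlNet core denoise steps expose `control_guidance_start`/`control_guidance_end` inputs (see the auto-blocks docstring further below), which bound the fraction of the schedule during which ControlNet is applied; a common way to turn them into a per-step factor looks roughly like this (an illustrative sketch, not the block's exact code):

    # 1.0 while the step's progress falls inside [start, end), else 0.0; the
    # effective per-step scale is controlnet_conditioning_scale * keep[i].
    control_guidance_start, control_guidance_end = 0.0, 1.0
    num_inference_steps = 50

    keep = [
        1.0 if control_guidance_start <= i / num_inference_steps < control_guidance_end else 0.0
        for i in range(num_inference_steps)
    ]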
     Components:
-        pachifier (`QwenImagePachifier`)
-        scheduler (`FlowMatchEulerDiscreteScheduler`)
-        controlnet (`QwenImageControlNetModel`)
-        guider (`ClassifierFreeGuidance`)
-        transformer (`QwenImageTransformer2DModel`)
+        pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
+        (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
 
     Inputs:
         num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -802,7 +794,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks):
         processed_mask_image (`Tensor`, *optional*):
             The processed mask image
         control_image_latents (`Tensor`):
-            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.
+            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+            step.
         latents (`Tensor`, *optional*):
             Pre-generated noisy latents for image generation.
         generator (`Generator`, *optional*):
@@ -868,14 +861,12 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
     """
-    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task.
+    Before denoise step that prepares the inputs (timesteps, latents, rope inputs etc.) for the denoise step for the
+    img2img task.
 
     Components:
-        pachifier (`QwenImagePachifier`)
-        scheduler (`FlowMatchEulerDiscreteScheduler`)
-        controlnet (`QwenImageControlNetModel`)
-        guider (`ClassifierFreeGuidance`)
-        transformer (`QwenImageTransformer2DModel`)
+        pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
+        (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
 
     Inputs:
         num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -895,7 +886,8 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks):
         image_latents (`Tensor`):
             image latents used to guide the image generation. Can be generated from vae_encoder step.
         control_image_latents (`Tensor`):
-            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.
+            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+            step.
         latents (`Tensor`, *optional*):
             Pre-generated noisy latents for image generation.
         generator (`Generator`, *optional*):
@@ -1030,12 +1022,12 @@ class QwenImageDecodeStep(SequentialPipelineBlocks):
     Decode step that decodes the latents to images and postprocesses the generated image.
 
     Components:
-        vae (`AutoencoderKLQwenImage`)
-        image_processor (`VaeImageProcessor`)
+        vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
 
     Inputs:
         latents (`Tensor`):
-            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.
+            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+            step.
         output_type (`str`, *optional*, defaults to pil):
             Output format: 'pil', 'np', 'pt'.
 
@@ -1057,19 +1049,21 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
     """
-    Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image.
+    Decode step that decodes the latents to images and postprocesses the generated image, optionally applying the
+    mask overlay to the original image.
 
     Components:
-        vae (`AutoencoderKLQwenImage`)
-        image_mask_processor (`InpaintProcessor`)
+        vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`)
 
     Inputs:
         latents (`Tensor`):
-            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.
+            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+            step.
         output_type (`str`, *optional*, defaults to pil):
             Output format: 'pil', 'np', 'pt'.
         mask_overlay_kwargs (`Dict`, *optional*):
-            The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep.
+            The kwargs for the postprocess step to apply the mask overlay. Generated in
+            InpaintProcessImagesInputStep.
 
     Outputs:
         images (`List`):
@@ -1125,17 +1119,11 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
     - for text-to-image generation, all you need to provide is `prompt`
 
     Components:
-        text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use
-        tokenizer (`Qwen2Tokenizer`): The tokenizer to use
-        guider (`ClassifierFreeGuidance`)
-        image_mask_processor (`InpaintProcessor`)
-        vae (`AutoencoderKLQwenImage`)
-        image_processor (`VaeImageProcessor`)
-        controlnet (`QwenImageControlNetModel`)
-        control_image_processor (`VaeImageProcessor`)
-        pachifier (`QwenImagePachifier`)
-        scheduler (`FlowMatchEulerDiscreteScheduler`)
-        transformer (`QwenImageTransformer2DModel`)
+        text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
+        The tokenizer to use guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae
+        (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) controlnet (`QwenImageControlNetModel`)
+        control_image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler
+        (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
 
     Inputs:
         prompt (`str`, *optional*):
@@ -1185,7 +1173,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
         strength (`float`, *optional*, defaults to 0.9):
             Strength for img2img/inpainting.
         control_image_latents (`Tensor`, *optional*):
-            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.
+            The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
+            step.
         control_guidance_start (`float`, *optional*, defaults to 0.0):
             When to start applying ControlNet.
         control_guidance_end (`float`, *optional*, defaults to 1.0):
@@ -1195,7 +1184,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks):
         output_type (`str`, *optional*, defaults to pil):
             Output format: 'pil', 'np', 'pt'.
         mask_overlay_kwargs (`Dict`, *optional*):
-            The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep.
+            The kwargs for the postprocess step to apply the mask overlay. Generated in
+            InpaintProcessImagesInputStep.
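Since the auto blocks select among the text2image, img2img, inpaint, and ControlNet paths based on which of the inputs above are provided, end-to-end usage stays small; a hedged sketch (the repo id is a placeholder, and the loading/calling pattern follows the modular-pipelines API as we understand it):

    import torch

    from diffusers.modular_pipelines.qwenimage.modular_blocks_qwenimage import QwenImageAutoBlocks

    blocks = QwenImageAutoBlocks()
    pipe = blocks.init_pipeline("<modular-repo-id>")  # placeholder repo id
    pipe.load_components(torch_dtype=torch.bfloat16)
    pipe.to("cuda")

    # text-to-image: only `prompt` is needed; passing `image` (and optionally
    # `mask_image` or `control_image`) would select the other paths instead.
    images = pipe(prompt="a cat wearing sunglasses", output="images")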
     Outputs:
         images (`List`):
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py
index 0c1fa00842..e1e5c43354 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py
@@ -13,11 +13,12 @@
 # limitations under the License.
 
 from typing import Optional
+
 import torch
 
 from ...utils import logging
 from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks
-from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam
+from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam
 from .before_denoise import (
     QwenImageCreateMaskLatentsStep,
     QwenImageEditRoPEInputsStep,
@@ -63,10 +64,8 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
     QwenImage-Edit VL encoder step that encodes the image and text prompts together.
 
     Components:
-        image_resize_processor (`VaeImageProcessor`)
-        text_encoder (`Qwen2_5_VLForConditionalGeneration`)
-        processor (`Qwen2VLProcessor`)
-        guider (`ClassifierFreeGuidance`)
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`)
 
     Inputs:
         image (`Union[Image, List]`):
@@ -113,9 +112,8 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
     Vae encoder step that encodes the image inputs into their latent representations.
 
     Components:
-        image_resize_processor (`VaeImageProcessor`)
-        image_processor (`VaeImageProcessor`)
-        vae (`AutoencoderKLQwenImage`)
+        image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
+        (`AutoencoderKLQwenImage`)
 
     Inputs:
         image (`Union[Image, List]`):
@@ -155,9 +153,8 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
     - create image latents.
 
     Components:
-        image_resize_processor (`VaeImageProcessor`)
-        image_mask_processor (`InpaintProcessor`)
-        vae (`AutoencoderKLQwenImage`)
+        image_resize_processor (`VaeImageProcessor`) image_mask_processor (`InpaintProcessor`) vae
+        (`AutoencoderKLQwenImage`)
 
     Inputs:
         image (`Union[Image, List]`):
@@ -354,7 +351,10 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks):
     model_name = "qwenimage-edit"
     block_classes = [
         QwenImageTextInputsStep(),
-        QwenImageAdditionalInputsStep(additional_batch_inputs=[InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")]
+        QwenImageAdditionalInputsStep(
+            additional_batch_inputs=[
+                InputParam(name="processed_mask_image", type_hint=torch.Tensor, description="The processed mask image")
+            ]
         ),
     ]
     block_names = ["text_inputs", "additional_inputs"]
@@ -377,15 +377,14 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks):
     - Create the patchified latents `mask` based on the processed mask image.
 
     Components:
-        scheduler (`FlowMatchEulerDiscreteScheduler`)
-        pachifier (`QwenImagePachifier`)
+        scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`)
 
     Inputs:
         latents (`Tensor`):
             The initial random noise. Can be generated in the prepare latents step.
         image_latents (`Tensor`):
-            image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be generated from
-            vae encoder and updated in input step.)
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
(Can be + generated from vae encoder and updated in input step.) timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): @@ -426,10 +425,8 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit (img2img) task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -502,10 +499,8 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): Core denoising workflow for QwenImage-Edit edit inpaint task. Components: - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - guider (`ClassifierFreeGuidance`) - transformer (`QwenImageTransformer2DModel`) + pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider + (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`) Inputs: num_images_per_prompt (`int`, *optional*, defaults to 1): @@ -623,12 +618,12 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): Decode step that decodes the latents to images and postprocess the generated image. Components: - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) + vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. @@ -650,19 +645,21 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: - vae (`AutoencoderKLQwenImage`) - image_mask_processor (`InpaintProcessor`) + vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`) Inputs: latents (`Tensor`): - The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step. + The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise + step. output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): @@ -719,19 +716,14 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. 
- for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide + `padding_mask_crop` Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) - image_mask_processor (`InpaintProcessor`) - vae (`AutoencoderKLQwenImage`) - image_processor (`VaeImageProcessor`) - pachifier (`QwenImagePachifier`) - scheduler (`FlowMatchEulerDiscreteScheduler`) - transformer (`QwenImageTransformer2DModel`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae + (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler + (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`) Inputs: image (`Union[Image, List]`): @@ -771,7 +763,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt'. mask_overlay_kwargs (`Dict`, *optional*): - The kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep. + The kwargs for the postprocess step to apply the mask overlay. generated in + InpaintProcessImagesInputStep. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index 726c000f4b..37656cef5d 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from ...utils import logging from ..modular_pipeline import SequentialPipelineBlocks -from ..modular_pipeline_utils import InsertableDict, OutputParam, InputParam +from ..modular_pipeline_utils import InsertableDict, OutputParam from .before_denoise import ( QwenImageEditPlusRoPEInputsStep, QwenImagePrepareLatentsStep, @@ -55,10 +54,8 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): QwenImage-Edit Plus VL encoder step that encodes the image and text prompts together. Components: - image_resize_processor (`VaeImageProcessor`) - text_encoder (`Qwen2_5_VLForConditionalGeneration`) - processor (`Qwen2VLProcessor`) - guider (`ClassifierFreeGuidance`) + image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor + (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) Inputs: image (`Union[Image, List]`): @@ -107,9 +104,8 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): Each image is resized independently based on its own aspect ratio to 1024x1024 target area. 
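Resizing to a fixed target area while keeping the aspect ratio amounts to scaling both sides by sqrt(target_area / (width * height)); a small illustration (the helper name and the rounding multiple are our own, not the library's):

    import math

    def fit_to_area(width, height, target_area=1024 * 1024, multiple=32):
        scale = math.sqrt(target_area / (width * height))
        return (
            max(multiple, int(width * scale / multiple) * multiple),
            max(multiple, int(height * scale / multiple) * multiple),
        )

    print(fit_to_area(1920, 1080))  # -> (1344, 768)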
     Components:
-        image_resize_processor (`VaeImageProcessor`)
-        image_processor (`VaeImageProcessor`)
-        vae (`AutoencoderKLQwenImage`)
+        image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
+        (`AutoencoderKLQwenImage`)
 
     Inputs:
         image (`Union[Image, List]`):
@@ -231,10 +227,8 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
     Core denoising workflow for QwenImage-Edit Plus edit (img2img) task.
 
     Components:
-        pachifier (`QwenImagePachifier`)
-        scheduler (`FlowMatchEulerDiscreteScheduler`)
-        guider (`ClassifierFreeGuidance`)
-        transformer (`QwenImageTransformer2DModel`)
+        pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
 
     Inputs:
         num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -311,12 +305,12 @@ class QwenImageEditPlusDecodeStep(SequentialPipelineBlocks):
     Decode step that decodes the latents to images and postprocesses the generated image.
 
     Components:
-        vae (`AutoencoderKLQwenImage`)
-        image_processor (`VaeImageProcessor`)
+        vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)
 
     Inputs:
         latents (`Tensor`):
-            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.
+            The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
+            step.
         output_type (`str`, *optional*, defaults to pil):
             Output format: 'pil', 'np', 'pt'.
 
@@ -357,14 +351,9 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks):
     - VL encoder uses 384x384 target area, VAE encoder uses 1024x1024 target area.
 
     Components:
-        image_resize_processor (`VaeImageProcessor`)
-        text_encoder (`Qwen2_5_VLForConditionalGeneration`)
-        processor (`Qwen2VLProcessor`)
-        guider (`ClassifierFreeGuidance`)
-        image_processor (`VaeImageProcessor`)
-        vae (`AutoencoderKLQwenImage`)
-        pachifier (`QwenImagePachifier`)
-        scheduler (`FlowMatchEulerDiscreteScheduler`)
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) guider (`ClassifierFreeGuidance`) image_processor (`VaeImageProcessor`) vae
+        (`AutoencoderKLQwenImage`) pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`)
         transformer (`QwenImageTransformer2DModel`)
 
     Inputs:
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py
index 37a06e9af2..fdfeab0488 100644
--- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py
+++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import torch
 from ...utils import logging
 from ..modular_pipeline import SequentialPipelineBlocks
 from ..modular_pipeline_utils import InsertableDict, OutputParam
@@ -53,14 +52,12 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
 # auto_docstring
 class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks):
     """
-    QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided.
+    QwenImage-Layered Text encoder step that encodes the text prompt; it will generate a prompt based on the image if
+    not provided.
     Components:
-        image_resize_processor (`VaeImageProcessor`)
-        text_encoder (`Qwen2_5_VLForConditionalGeneration`)
-        processor (`Qwen2VLProcessor`)
-        tokenizer (`Qwen2Tokenizer`): The tokenizer to use
-        guider (`ClassifierFreeGuidance`)
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`)
 
     Inputs:
         image (`Union[Image, List]`):
@@ -116,9 +113,8 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks):
     Vae encoder step that encodes the image inputs into their latent representations.
 
     Components:
-        image_resize_processor (`VaeImageProcessor`)
-        image_processor (`VaeImageProcessor`)
-        vae (`AutoencoderKLQwenImage`)
+        image_resize_processor (`VaeImageProcessor`) image_processor (`VaeImageProcessor`) vae
+        (`AutoencoderKLQwenImage`)
 
     Inputs:
         image (`Union[Image, List]`):
@@ -203,8 +199,8 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks):
         width (`int`):
             if not provided, updated to image width
         image_latents (`Tensor`):
-            image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified with layered
-            pachifier and batch-expanded)
+            image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
+            with layered pachifier and batch-expanded)
     """
 
     model_name = "qwenimage-layered"
@@ -230,10 +226,8 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks):
     Core denoising workflow for QwenImage-Layered img2img task.
 
     Components:
-        pachifier (`QwenImageLayeredPachifier`)
-        scheduler (`FlowMatchEulerDiscreteScheduler`)
-        guider (`ClassifierFreeGuidance`)
-        transformer (`QwenImageTransformer2DModel`)
+        pachifier (`QwenImageLayeredPachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
+        (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)
 
     Inputs:
         num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -317,16 +311,10 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks):
     Auto Modular pipeline for layered denoising tasks using QwenImage-Layered.
 
     Components:
-        image_resize_processor (`VaeImageProcessor`)
-        text_encoder (`Qwen2_5_VLForConditionalGeneration`)
-        processor (`Qwen2VLProcessor`)
-        tokenizer (`Qwen2Tokenizer`): The tokenizer to use
-        guider (`ClassifierFreeGuidance`)
-        image_processor (`VaeImageProcessor`)
-        vae (`AutoencoderKLQwenImage`)
-        pachifier (`QwenImageLayeredPachifier`)
-        scheduler (`FlowMatchEulerDiscreteScheduler`)
-        transformer (`QwenImageTransformer2DModel`)
+        image_resize_processor (`VaeImageProcessor`) text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor
+        (`Qwen2VLProcessor`) tokenizer (`Qwen2Tokenizer`): The tokenizer to use guider (`ClassifierFreeGuidance`)
+        image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`) pachifier (`QwenImageLayeredPachifier`)
+        scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)
 
     Inputs:
         image (`Union[Image, List]`):