From b29873dee72ea60e155a2a14a72e6e6ee6195b63 Mon Sep 17 00:00:00 2001 From: yiyixuxu Date: Sat, 10 Jan 2026 10:52:53 +0100 Subject: [PATCH] up up --- .../modular_pipeline_utils.py | 59 ++++++++++++------ .../qwenimage/before_denoise.py | 6 +- .../modular_pipelines/qwenimage/decoders.py | 28 +++------ .../modular_pipelines/qwenimage/denoise.py | 16 ++--- .../modular_pipelines/qwenimage/encoders.py | 61 ++++++++++--------- .../modular_pipelines/qwenimage/inputs.py | 34 +++++------ .../qwenimage/modular_blocks_qwenimage.py | 26 ++++++-- .../modular_pipelines/z_image/denoise.py | 5 +- 8 files changed, 126 insertions(+), 109 deletions(-) diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index afc4d6959a..cb179eccc7 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -342,6 +342,18 @@ class InputParam: def __repr__(self): return f"<{self.name}: {'required' if self.required else 'optional'}, default={self.default}>" + + @classmethod + def template(cls, name: str) -> Optional["InputParam"]: + """Get template for name if exists, otherwise None.""" + if hasattr(cls, name) and callable(getattr(cls, name)): + return getattr(cls, name)() + return None + + # ====================================================== + # InputParam templates + # ====================================================== + @classmethod def prompt(cls) -> "InputParam": return cls(name="prompt", type_hint=str, required=True, @@ -383,7 +395,6 @@ class InputParam: return cls(name="generator", type_hint=torch.Generator, default=None, description="Torch generator for deterministic generation.") - @classmethod def sigmas(cls) -> "InputParam": return cls(name="sigmas", type_hint=List[float], default=None, @@ -394,6 +405,7 @@ class InputParam: return cls(name="strength", type_hint=float, default=default, description="Strength for img2img/inpainting.") + # images @classmethod def image(cls) -> "InputParam": return cls(name="image", type_hint=PIL.Image.Image, required=True, @@ -425,12 +437,24 @@ class InputParam: def timesteps(cls) -> "InputParam": return cls(name="timesteps", type_hint=torch.Tensor, default=None, description="Timesteps for the denoising process.") - - - # ===================================================================== - # ControlNet - # ===================================================================== + @classmethod + def output_type(cls) -> "InputParam": + return cls(name="output_type", type_hint=str, default="pil", + description="Output format: 'pil', 'np', 'pt''.") + + @classmethod + def attention_kwargs(cls) -> "InputParam": + return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, + description="Additional kwargs for attention processors.") + + @classmethod + def denoiser_input_fields(cls) -> "InputParam": + return cls(kwargs_type="denoiser_input_fields", type_hint=torch.Tensor, + description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.") + + + # ControlNet @classmethod def control_guidance_start(cls, default: float = 0.0) -> "InputParam": return cls(name="control_guidance_start", type_hint=float, default=default, @@ -446,18 +470,6 @@ class InputParam: return cls(name="controlnet_conditioning_scale", type_hint=float, default=default, description="Scale for ControlNet conditioning.") - - @classmethod - def output_type(cls) -> "InputParam": - return cls(name="output_type", type_hint=str, default="pil", - description="Output format: 'pil', 'np', 'pt', or 'latent'.") - - @classmethod - def attention_kwargs(cls) -> "InputParam": - return cls(name="attention_kwargs", type_hint=Dict[str, Any], default=None, - description="Additional kwargs for attention processors.") - - @dataclass class OutputParam: """Specification for an output parameter.""" @@ -472,6 +484,17 @@ class OutputParam: f"<{self.name}: {self.type_hint.__name__ if hasattr(self.type_hint, '__name__') else str(self.type_hint)}>" ) + @classmethod + def template(cls, name: str) -> Optional["OutputParam"]: + """Get template for name if exists, otherwise None.""" + if hasattr(cls, name) and callable(getattr(cls, name)): + return getattr(cls, name)() + return None + + # ====================================================== + # OutputParam templates + # ====================================================== + @classmethod def images(cls) -> "OutputParam": return cls(name="images", type_hint=List[PIL.Image.Image], diff --git a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py index 6fa4a971c2..d61711e13a 100644 --- a/src/diffusers/modular_pipelines/qwenimage/before_denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/before_denoise.py @@ -228,7 +228,7 @@ class QwenImageLayeredPrepareLatentsStep(ModularPipelineBlocks): InputParam.latents(), InputParam.height(), InputParam.width(), - InputParam(name="layers", type_hint=int, default=4), + InputParam(name="layers", type_hint=int, default=4, description="Number of layers to extract from the image"), InputParam.num_images_per_prompt(), InputParam.generator(), InputParam( @@ -598,7 +598,7 @@ class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks): type_hint=torch.Tensor, description="The latents to use for the denoising process, used to calculate the image sequence length.", ), - InputParam(name="strength", default=0.9), + InputParam.strength(0.9), ] @property @@ -886,7 +886,7 @@ class QwenImageLayeredRoPEInputsStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam(name="batch_size", required=True), - InputParam(name="layers", required=True), + InputParam(name="layers", default=4, description="Number of layers to extract from the image"), InputParam(name="height", required=True), InputParam(name="width", required=True), InputParam(name="prompt_embeds_mask"), diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py index 24a88ebfca..9c3a1c01d0 100644 --- a/src/diffusers/modular_pipelines/qwenimage/decoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/decoders.py @@ -91,7 +91,7 @@ class QwenImageLayeredAfterDenoiseStep(ModularPipelineBlocks): InputParam("latents", required=True, type_hint=torch.Tensor), InputParam("height", required=True, type_hint=int), InputParam("width", required=True, type_hint=int), - InputParam("layers", required=True, type_hint=int), + InputParam("layers", default=4, description="Number of layers to extract from the image"), ] @torch.no_grad() @@ -141,11 +141,7 @@ class QwenImageDecoderStep(ModularPipelineBlocks): @property def intermediate_outputs(self) -> List[str]: return [ - OutputParam( - "images", - type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]], - description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array", - ) + OutputParam.images() ] @torch.no_grad() @@ -198,14 +194,14 @@ class QwenImageLayeredDecoderStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("latents", required=True, type_hint=torch.Tensor), - InputParam("output_type", default="pil", type_hint=str), + InputParam("latents", required=True, type_hint=torch.Tensor, description="The latents to decode, can be generated in the denoise step"), + InputParam.output_type(), ] @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam(name="images", type_hint=List[List[PIL.Image.Image]]), + OutputParam.images(), ] @torch.no_grad() @@ -273,12 +269,7 @@ class QwenImageProcessImagesOutputStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam("images", required=True, description="the generated image from decoders step"), - InputParam( - name="output_type", - default="pil", - type_hint=str, - description="The type of the output images, can be 'pil', 'np', 'pt'", - ), + InputParam.output_type(), ] @staticmethod @@ -323,12 +314,7 @@ class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks): def inputs(self) -> List[InputParam]: return [ InputParam("images", required=True, description="the generated image from decoders step"), - InputParam( - name="output_type", - default="pil", - type_hint=str, - description="The type of the output images, can be 'pil', 'np', 'pt'", - ), + InputParam.output_type(), InputParam("mask_overlay_kwargs"), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/denoise.py b/src/diffusers/modular_pipelines/qwenimage/denoise.py index eb1e5a341c..472945b226 100644 --- a/src/diffusers/modular_pipelines/qwenimage/denoise.py +++ b/src/diffusers/modular_pipelines/qwenimage/denoise.py @@ -218,7 +218,7 @@ class QwenImageLoopDenoiser(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("attention_kwargs"), + InputParam.attention_kwargs(), InputParam( "latents", required=True, @@ -231,10 +231,7 @@ class QwenImageLoopDenoiser(ModularPipelineBlocks): type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam( - kwargs_type="denoiser_input_fields", - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", - ), + InputParam.denoiser_input_fields(), InputParam( "img_shapes", required=True, @@ -322,7 +319,7 @@ class QwenImageEditLoopDenoiser(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("attention_kwargs"), + InputParam.attention_kwargs(), InputParam( "latents", required=True, @@ -335,10 +332,7 @@ class QwenImageEditLoopDenoiser(ModularPipelineBlocks): type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam( - kwargs_type="denoiser_input_fields", - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", - ), + InputParam.denoiser_input_fields(), InputParam( "img_shapes", required=True, @@ -424,7 +418,7 @@ class QwenImageLoopAfterDenoiser(ModularPipelineBlocks): @property def intermediate_outputs(self) -> List[OutputParam]: return [ - OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents."), + OutputParam.latents(), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index 4b66dd32e5..2eca8645ef 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -301,8 +301,8 @@ class QwenImageEditResizeStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam( - name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" + InputParam.template(self._image_input_name) or InputParam( + name=self._image_input_name, required=True, type_hint=torch.Tensor, description="Input image for conditioning" ), ] @@ -381,7 +381,7 @@ class QwenImageLayeredResizeStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam( + InputParam.template(self._image_input_name) or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, description="The image to resize" ), InputParam( @@ -484,7 +484,7 @@ class QwenImageEditPlusResizeStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam( + InputParam.template(self._image_input_name) or InputParam( name=self._image_input_name, required=True, type_hint=torch.Tensor, @@ -564,7 +564,7 @@ class QwenImageLayeredGetImagePromptStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", type_hint=str, description="The prompt to encode"), + InputParam(name="prompt", type_hint=str, description="The prompt to encode"), # it is not required for qwenimage-layered, unlike other pipelines InputParam( name="resized_image", required=True, @@ -647,11 +647,9 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), - InputParam( - name="max_sequence_length", type_hint=int, description="The max sequence length to use", default=1024 - ), + InputParam.prompt(), + InputParam.negative_prompt(), + InputParam.max_sequence_length(1024), ] @property @@ -772,8 +770,8 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), + InputParam.prompt(), + InputParam.negative_prompt(), InputParam( name="resized_image", required=True, @@ -895,8 +893,8 @@ class QwenImageEditPlusTextEncoderStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"), - InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"), + InputParam.prompt(), + InputParam.negative_prompt(), InputParam( name="resized_cond_image", required=True, @@ -1010,11 +1008,11 @@ class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("mask_image", required=True), - InputParam("image", required=True), - InputParam("height"), - InputParam("width"), - InputParam("padding_mask_crop"), + InputParam.mask_image(), + InputParam.image(), + InputParam.height(), + InputParam.width(), + InputParam.padding_mask_crop(), ] @property @@ -1082,9 +1080,9 @@ class QwenImageEditInpaintProcessImagesInputStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("mask_image", required=True), - InputParam("resized_image", required=True), - InputParam("padding_mask_crop"), + InputParam.mask_image(), + InputParam("resized_image", required=True, type_hint=PIL.Image.Image, description="The resized image. should be generated using a resize step"), + InputParam.padding_mask_crop(), ] @property @@ -1140,9 +1138,9 @@ class QwenImageProcessImagesInputStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam("image", required=True), - InputParam("height"), - InputParam("width"), + InputParam.image(), + InputParam.height(), + InputParam.width(), ] @property @@ -1312,7 +1310,10 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: - return [InputParam(self._image_input_name, required=True), InputParam("generator")] + return [ + InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), + InputParam.generator(), + ] @property def intermediate_outputs(self) -> List[OutputParam]: @@ -1383,10 +1384,10 @@ class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam("control_image", required=True), - InputParam("height"), - InputParam("width"), - InputParam("generator"), + InputParam.control_image(), + InputParam.height(), + InputParam.width(), + InputParam.generator(), ] return inputs diff --git a/src/diffusers/modular_pipelines/qwenimage/inputs.py b/src/diffusers/modular_pipelines/qwenimage/inputs.py index 4a1cf3700c..e28493ecc3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/inputs.py +++ b/src/diffusers/modular_pipelines/qwenimage/inputs.py @@ -129,7 +129,7 @@ class QwenImageTextInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam(name="num_images_per_prompt", default=1), + InputParam.num_images_per_prompt(), InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"), InputParam(name="prompt_embeds_mask", required=True, kwargs_type="denoiser_input_fields"), InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"), @@ -269,17 +269,17 @@ class QwenImageAdditionalInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam(name="num_images_per_prompt", default=1), + InputParam.num_images_per_prompt(), InputParam(name="batch_size", required=True), - InputParam(name="height"), - InputParam(name="width"), + InputParam.height(), + InputParam.width(), ] for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam(name=image_latent_input_name)) + inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) for input_name in self._additional_batch_inputs: - inputs.append(InputParam(name=input_name)) + inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) return inputs @@ -398,17 +398,17 @@ class QwenImageEditPlusAdditionalInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam(name="num_images_per_prompt", default=1), + InputParam.num_images_per_prompt(), InputParam(name="batch_size", required=True), - InputParam(name="height"), - InputParam(name="width"), + InputParam.height(), + InputParam.width(), ] for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam(name=image_latent_input_name)) + inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) for input_name in self._additional_batch_inputs: - inputs.append(InputParam(name=input_name)) + inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) return inputs @@ -544,15 +544,15 @@ class QwenImageLayeredAdditionalInputsStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: inputs = [ - InputParam(name="num_images_per_prompt", default=1), + InputParam.num_images_per_prompt(), InputParam(name="batch_size", required=True), ] for image_latent_input_name in self._image_latent_inputs: - inputs.append(InputParam(name=image_latent_input_name)) + inputs.append(InputParam.template(image_latent_input_name) or InputParam(name=image_latent_input_name)) for input_name in self._additional_batch_inputs: - inputs.append(InputParam(name=input_name)) + inputs.append(InputParam.template(input_name) or InputParam(name=input_name)) return inputs @@ -638,9 +638,9 @@ class QwenImageControlNetInputsStep(ModularPipelineBlocks): return [ InputParam(name="control_image_latents", required=True), InputParam(name="batch_size", required=True), - InputParam(name="num_images_per_prompt", default=1), - InputParam(name="height"), - InputParam(name="width"), + InputParam.num_images_per_prompt(), + InputParam.height(), + InputParam.width(), ] @torch.no_grad() diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 63e9f5a283..c349c7d9f2 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -54,7 +54,23 @@ logger = logging.get_logger(__name__) # ==================== -# 1. VAE ENCODER +# 1. TEXT ENCODER +# ==================== + +class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): + model_name = "qwenimage" + block_classes = [QwenImageTextEncoderStep()] + block_names = ["text_encoder"] + block_trigger_inputs = ["prompt"] + + @property + def description(self) -> str: + return "Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block." + " - `QwenImageTextEncoderStep` (text_encoder) is used when `prompt` is provided." + " - if `prompt` is not provided, step will be skipped." + +# ==================== +# 2. VAE ENCODER # ==================== @@ -118,7 +134,7 @@ class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks): # ==================== -# 2. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) +# 3. DENOISE (input -> prepare_latents -> set_timesteps -> prepare_rope_inputs -> denoise -> after_denoise) # ==================== @@ -396,7 +412,7 @@ class QwenImageAutoCoreDenoiseStep(ConditionalPipelineBlocks): # ==================== -# 3. DECODE +# 4. DECODE # ==================== @@ -439,11 +455,11 @@ class QwenImageAutoDecodeStep(AutoPipelineBlocks): # ==================== -# 4. AUTO BLOCKS & PRESETS +# 5. AUTO BLOCKS & PRESETS # ==================== AUTO_BLOCKS = InsertableDict( [ - ("text_encoder", QwenImageTextEncoderStep()), + ("text_encoder", QwenImageAutoTextEncoderStep()), ("vae_encoder", QwenImageAutoVaeEncoderStep()), ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()), ("denoise", QwenImageAutoCoreDenoiseStep()), diff --git a/src/diffusers/modular_pipelines/z_image/denoise.py b/src/diffusers/modular_pipelines/z_image/denoise.py index 3d5a00a9df..a165fb513f 100644 --- a/src/diffusers/modular_pipelines/z_image/denoise.py +++ b/src/diffusers/modular_pipelines/z_image/denoise.py @@ -129,10 +129,7 @@ class ZImageLoopDenoiser(ModularPipelineBlocks): type_hint=int, description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.", ), - InputParam( - kwargs_type="denoiser_input_fields", - description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.", - ), + InputParam.denoiser_input_fields(), ] guider_input_names = [] uncond_guider_input_names = []