diff --git a/src/diffusers/modular_pipelines/modular_pipeline_utils.py b/src/diffusers/modular_pipelines/modular_pipeline_utils.py index 368fbbcbd1..45556c538a 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline_utils.py +++ b/src/diffusers/modular_pipelines/modular_pipeline_utils.py @@ -708,6 +708,8 @@ def format_params(params, header="Args", indent_level=4, max_line_length=115): desc = re.sub(r"\[(.*?)\]\((https?://[^\s\)]+)\)", r"[\1](\2)", param.description) wrapped_desc = wrap_text(desc, desc_indent, max_line_length) param_str += f"\n{desc_indent}{wrapped_desc}" + else: + param_str += f"\n{desc_indent}TODO: Add description." formatted_params.append(param_str) diff --git a/src/diffusers/modular_pipelines/qwenimage/encoders.py b/src/diffusers/modular_pipelines/qwenimage/encoders.py index f0dd6471b1..8d7b190542 100644 --- a/src/diffusers/modular_pipelines/qwenimage/encoders.py +++ b/src/diffusers/modular_pipelines/qwenimage/encoders.py @@ -1324,7 +1324,8 @@ class QwenImageVaeEncoderStep(ModularPipelineBlocks): @property def inputs(self) -> List[InputParam]: return [ - InputParam.template(self._image_input_name) or InputParam(name=self._image_input_name, required=True), + InputParam.template(self._image_input_name) + or InputParam(name=self._image_input_name, required=True, description="The image tensor to encode"), InputParam.generator(), ] diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py index 3bd4ae5683..645c01f66e 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.py @@ -75,11 +75,8 @@ class QwenImageAutoTextEncoderStep(AutoPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -151,7 +148,9 @@ class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks): Outputs: processed_image (`None`): + TODO: Add description. processed_mask_image (`None`): + TODO: Add description. mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): @@ -195,6 +194,7 @@ class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks): Outputs: processed_image (`None`): + TODO: Add description. image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -290,14 +290,19 @@ class QwenImageImg2ImgInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -334,15 +339,21 @@ class QwenImageInpaintInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -389,14 +400,18 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. height (`None`): + TODO: Add description. width (`None`): + TODO: Add description. dtype (`None`): + TODO: Add description. Outputs: initial_noise (`Tensor`): @@ -425,7 +440,8 @@ class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks): # auto_docstring class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: @@ -441,9 +457,13 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. height (`int`, *optional*): @@ -499,7 +519,8 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: @@ -515,15 +536,21 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -579,7 +606,8 @@ class QwenImageInpaintCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. Components: @@ -595,14 +623,19 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -658,7 +691,8 @@ class QwenImageImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): """ - step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.). + step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs + (timesteps, latents, rope inputs etc.). Components: @@ -676,10 +710,15 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. control_image_latents (`None`): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): @@ -746,7 +785,8 @@ class QwenImageControlNetCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint + task. Components: @@ -764,16 +804,23 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. control_image_latents (`None`): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -840,7 +887,8 @@ class QwenImageControlNetInpaintCoreDenoiseStep(SequentialPipelineBlocks): # auto_docstring class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): """ - Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task. + Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img + task. Components: @@ -858,15 +906,21 @@ class QwenImageControlNetImg2ImgCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. control_image_latents (`None`): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -1031,7 +1085,8 @@ class QwenImageDecodeStep(SequentialPipelineBlocks): # auto_docstring class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask + overally to the original image. Components: @@ -1045,6 +1100,7 @@ class QwenImageInpaintDecodeStep(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. Outputs: images (`List`): @@ -1126,11 +1182,8 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -1160,9 +1213,13 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -1174,10 +1231,13 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): **denoiser_input_fields (`Tensor`, *optional*): conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. strength (`float`, *optional*, defaults to 0.9): Strength for img2img/inpainting. control_image_latents (`None`, *optional*): + TODO: Add description. control_guidance_start (`float`, *optional*, defaults to 0.0): When to start applying ControlNet. control_guidance_end (`float`, *optional*, defaults to 1.0): @@ -1187,6 +1247,7 @@ class QwenImageAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py index 627cfce6ee..0bfbb921c9 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit.py @@ -74,11 +74,10 @@ class QwenImageEditVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -144,6 +143,7 @@ class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -192,7 +192,9 @@ class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. processed_mask_image (`None`): + TODO: Add description. mask_overlay_kwargs (`Dict`): The kwargs for the postprocess step to apply the mask overlay image_latents (`Tensor`): @@ -255,14 +257,19 @@ class QwenImageEditInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -306,15 +313,21 @@ class QwenImageEditInpaintInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -363,14 +376,18 @@ class QwenImageEditInpaintPrepareLatentsStep(SequentialPipelineBlocks): latents (`Tensor`): The initial random noised, can be generated in prepare latent step. image_latents (`Tensor`): - The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step. + The image latents to use for the denoising process. Can be generated in vae encoder and packed in input + step. timesteps (`Tensor`): The timesteps to use for the denoising process. Can be generated in set_timesteps step. processed_mask_image (`Tensor`): The processed mask to use for the inpainting process. height (`None`): + TODO: Add description. width (`None`): + TODO: Add description. dtype (`None`): + TODO: Add description. Outputs: initial_noise (`Tensor`): @@ -412,14 +429,19 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -487,15 +509,21 @@ class QwenImageEditInpaintCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -622,7 +650,8 @@ class QwenImageEditDecodeStep(SequentialPipelineBlocks): # auto_docstring class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): """ - Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask overlay to the original image. + Decode step that decodes the latents to images and postprocess the generated image, optionally apply the mask + overlay to the original image. Components: @@ -636,6 +665,7 @@ class QwenImageEditInpaintDecodeStep(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. Outputs: images (`List`): @@ -692,7 +722,8 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): """ Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit. - for edit (img2img) generation, you need to provide `image` - - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` + - for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide + `padding_mask_crop` Components: @@ -719,11 +750,10 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + <|vision_start|><|image_pad|><|vision_end|>{}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 64) @@ -747,7 +777,9 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): width (`int`): The width in pixels of the generated image. image_latents (`None`): + TODO: Add description. processed_mask_image (`None`, *optional*): + TODO: Add description. latents (`Tensor`): Pre-generated noisy latents for image generation. num_inference_steps (`int`): @@ -763,6 +795,7 @@ class QwenImageEditAutoBlocks(SequentialPipelineBlocks): output_type (`str`, *optional*, defaults to pil): Output format: 'pil', 'np', 'pt''. mask_overlay_kwargs (`None`, *optional*): + TODO: Add description. Outputs: images (`List`): diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py index cc07fc1e6a..8dab6fbcf9 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_edit_plus.py @@ -67,11 +67,10 @@ class QwenImageEditPlusVLEncoderStep(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) @@ -139,6 +138,7 @@ class QwenImageEditPlusVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -182,14 +182,19 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -240,14 +245,19 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. height (`int`, *optional*): The height in pixels of the generated image. width (`int`, *optional*): The width in pixels of the generated image. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. generator (`Generator`, *optional*): @@ -376,11 +386,10 @@ class QwenImageEditPlusAutoBlocks(SequentialPipelineBlocks): Configs: prompt_template_encode (default: <|im_start|>system - Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how + the user's text instruction should alter or modify the image. Generate a new image that meets the user's + requirements while maintaining consistency with the original input where appropriate.<|im_end|> <|im_start|>user + {}<|im_end|> <|im_start|>assistant ) img_template_encode (default: Picture {}: <|vision_start|><|image_pad|><|vision_end|>) diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py index 7cbc174871..544b1abfc3 100644 --- a/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py +++ b/src/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage_layered.py @@ -53,7 +53,8 @@ logger = logging.get_logger(__name__) # auto_docstring class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): """ - QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not provided. + QwenImage-Layered Text encoder step that encode the text prompt, will generate a prompt based on image if not + provided. Components: @@ -70,28 +71,23 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -101,16 +97,11 @@ class QwenImageLayeredTextEncoderStep(SequentialPipelineBlocks): 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) @@ -187,6 +178,7 @@ class QwenImageLayeredVaeEncoderStep(SequentialPipelineBlocks): resized_image (`List`): The resized images processed_image (`None`): + TODO: Add description. image_latents (`Tensor`): The latents representing the reference image(s). Single tensor or list depending on input. """ @@ -226,10 +218,15 @@ class QwenImageLayeredInputStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. image_latents (`None`, *optional*): + TODO: Add description. Outputs: batch_size (`int`): @@ -282,10 +279,15 @@ class QwenImageLayeredCoreDenoiseStep(SequentialPipelineBlocks): num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. prompt_embeds (`None`): + TODO: Add description. prompt_embeds_mask (`None`): + TODO: Add description. negative_prompt_embeds (`None`, *optional*): + TODO: Add description. negative_prompt_embeds_mask (`None`, *optional*): + TODO: Add description. image_latents (`None`, *optional*): + TODO: Add description. latents (`Tensor`, *optional*): Pre-generated noisy latents for image generation. layers (`int`, *optional*, defaults to 4): @@ -379,28 +381,23 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): Configs: image_caption_prompt_en (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # Image Annotator - You are a professional image annotator. Please write an image caption based on the input image: + You are a helpful assistant.<|im_end|> <|im_start|>user # Image Annotator You are a professional image annotator. + Please write an image caption based on the input image: 1. Write the caption using natural, descriptive language without structured formats or rich text. 2. Enrich caption details by including: - Object attributes, such as quantity, color, shape, size, material, state, position, actions, and so on - - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, attachment relations, action relations, comparative relations, causal relations, and so on + - Vision Relations between objects, such as spatial relations, functional relations, possessive relations, + attachment relations, action relations, comparative relations, causal relations, and so on - Environmental details, such as weather, lighting, colors, textures, atmosphere, and so on - - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the caption with quotation marks + - Identify the text clearly visible in the image, without translation or explanation, and highlight it in the + caption with quotation marks 3. Maintain authenticity and accuracy: - Avoid generalizations - Describe all visible information in the image, while do not add information not explicitly shown in the image - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) image_caption_prompt_cn (default: <|im_start|>system - You are a helpful assistant.<|im_end|> - <|im_start|>user - # 图像标注器 - 你是一个专业的图像标注器。请基于输入图像,撰写图注: + You are a helpful assistant.<|im_end|> <|im_start|>user # 图像标注器 你是一个专业的图像标注器。请基于输入图像,撰写图注: 1. 使用自然、描述性的语言撰写图注,不要使用结构化形式或富文本形式。 2. 通过加入以下内容,丰富图注细节: - 对象的属性:如数量、颜色、形状、大小、位置、材质、状态、动作等 @@ -410,16 +407,11 @@ class QwenImageLayeredAutoBlocks(SequentialPipelineBlocks): 3. 保持真实性与准确性: - 不要使用笼统的描述 - 描述图像中所有可见的信息,但不要加入没有在图像中出现的内容 - <|vision_start|><|image_pad|><|vision_end|><|im_end|> - <|im_start|>assistant - ) + <|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant ) prompt_template_encode (default: <|im_start|>system - Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|> - <|im_start|>user - {}<|im_end|> - <|im_start|>assistant - ) + Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the + objects and background:<|im_end|> <|im_start|>user {}<|im_end|> <|im_start|>assistant ) prompt_template_encode_start_idx (default: 34) diff --git a/utils/modular_auto_docstring.py b/utils/modular_auto_docstring.py index 01d984a584..7bb2c87e81 100644 --- a/utils/modular_auto_docstring.py +++ b/utils/modular_auto_docstring.py @@ -297,4 +297,4 @@ if __name__ == "__main__": args = parser.parse_args() - check_auto_docstrings(args.path, args.fix_and_overwrite) \ No newline at end of file + check_auto_docstrings(args.path, args.fix_and_overwrite)