From f3e09114f200f8fbf88b060e218ac5a50464fe8d Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 12 Jun 2025 01:18:40 +0530 Subject: [PATCH] Improve Wan docstrings (#11689) * improve docstrings for wan * Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * make style --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/diffusers/pipelines/wan/pipeline_wan.py | 11 ++++--- .../pipelines/wan/pipeline_wan_i2v.py | 10 +++--- .../pipelines/wan/pipeline_wan_vace.py | 32 +++++++++++++++++-- .../pipelines/wan/pipeline_wan_video2video.py | 9 ++++-- 4 files changed, 46 insertions(+), 16 deletions(-) diff --git a/src/diffusers/pipelines/wan/pipeline_wan.py b/src/diffusers/pipelines/wan/pipeline_wan.py index 3c0ac30bb6..6df66118b0 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan.py +++ b/src/diffusers/pipelines/wan/pipeline_wan.py @@ -388,8 +388,10 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin): Args: prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. + The prompt or prompts to guide the image generation. If not defined, pass `prompt_embeds` instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to avoid during image generation. If not defined, pass `negative_prompt_embeds` + instead. Ignored when not using guidance (`guidance_scale` < `1`). height (`int`, defaults to `480`): The height in pixels of the generated image. width (`int`, defaults to `832`): @@ -434,8 +436,9 @@ class WanPipeline(DiffusionPipeline, WanLoraLoaderMixin): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the `._callback_tensor_inputs` attribute of your pipeline class. - autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`): - The dtype to use for the torch.amp.autocast. + max_sequence_length (`int`, defaults to `512`): + The maximum sequence length of the text encoder. If the prompt is longer than this, it will be + truncated. If the prompt is shorter, it will be padded to this length. Examples: diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py index 77f0e4d56a..c71138a97d 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py @@ -562,12 +562,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the `._callback_tensor_inputs` attribute of your pipeline class. - max_sequence_length (`int`, *optional*, defaults to `512`): - The maximum sequence length of the prompt. - shift (`float`, *optional*, defaults to `5.0`): - The shift of the flow. - autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`): - The dtype to use for the torch.amp.autocast. + max_sequence_length (`int`, defaults to `512`): + The maximum sequence length of the text encoder. If the prompt is longer than this, it will be + truncated. If the prompt is shorter, it will be padded to this length. + Examples: Returns: diff --git a/src/diffusers/pipelines/wan/pipeline_wan_vace.py b/src/diffusers/pipelines/wan/pipeline_wan_vace.py index e029006aa1..a0b5ed93c9 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_vace.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_vace.py @@ -687,8 +687,33 @@ class WanVACEPipeline(DiffusionPipeline, WanLoraLoaderMixin): Args: prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + video (`List[PIL.Image.Image]`, *optional*): + The input video or videos to be used as a starting point for the generation. The video should be a list + of PIL images, a numpy array, or a torch tensor. Currently, the pipeline only supports generating one + video at a time. + mask (`List[PIL.Image.Image]`, *optional*): + The input mask defines which video regions to condition on and which to generate. Black areas in the + mask indicate conditioning regions, while white areas indicate regions for generation. The mask should + be a list of PIL images, a numpy array, or a torch tensor. Currently supports generating a single video + at a time. + reference_images (`List[PIL.Image.Image]`, *optional*): + A list of one or more reference images as extra conditioning for the generation. For example, if you + are trying to inpaint a video to change the character, you can pass reference images of the new + character here. Refer to the Diffusers [examples](https://github.com/huggingface/diffusers/pull/11582) + and original [user + guide](https://github.com/ali-vilab/VACE/blob/0897c6d055d7d9ea9e191dce763006664d9780f8/UserGuide.md) + for a full list of supported tasks and use cases. + conditioning_scale (`float`, `List[float]`, `torch.Tensor`, defaults to `1.0`): + The conditioning scale to be applied when adding the control conditioning latent stream to the + denoising latent stream in each control layer of the model. If a float is provided, it will be applied + uniformly to all layers. If a list or tensor is provided, it should have the same length as the number + of control layers in the model (`len(transformer.config.vace_layers)`). height (`int`, defaults to `480`): The height in pixels of the generated image. width (`int`, defaults to `832`): @@ -733,8 +758,9 @@ class WanVACEPipeline(DiffusionPipeline, WanLoraLoaderMixin): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the `._callback_tensor_inputs` attribute of your pipeline class. - autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`): - The dtype to use for the torch.amp.autocast. + max_sequence_length (`int`, defaults to `512`): + The maximum sequence length of the text encoder. If the prompt is longer than this, it will be + truncated. If the prompt is shorter, it will be padded to this length. Examples: diff --git a/src/diffusers/pipelines/wan/pipeline_wan_video2video.py b/src/diffusers/pipelines/wan/pipeline_wan_video2video.py index a4a10d4655..1a2d2e9c22 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_video2video.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_video2video.py @@ -508,7 +508,7 @@ class WanVideoToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin): Args: prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` instead. height (`int`, defaults to `480`): The height in pixels of the generated image. @@ -525,6 +525,8 @@ class WanVideoToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin): of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + strength (`float`, defaults to `0.8`): + Higher strength leads to more differences between original image and generated video. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. generator (`torch.Generator` or `List[torch.Generator]`, *optional*): @@ -554,8 +556,9 @@ class WanVideoToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the `._callback_tensor_inputs` attribute of your pipeline class. - autocast_dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`): - The dtype to use for the torch.amp.autocast. + max_sequence_length (`int`, defaults to `512`): + The maximum sequence length of the text encoder. If the prompt is longer than this, it will be + truncated. If the prompt is shorter, it will be padded to this length. Examples: