From 5a47442f9221eba3abe3e27c7af8304241ac58ca Mon Sep 17 00:00:00 2001
From: Cesaryuan <35998162+cesaryuan@users.noreply.github.com>
Date: Tue, 11 Nov 2025 05:57:52 +0800
Subject: [PATCH] Fix: update type hints for Tuple parameters across multiple
 files to support variable-length tuples (#12544)

* Fix: update type hints for Tuple parameters across multiple files to support variable-length tuples

* Apply style fixes

---------

Co-authored-by: github-actions[bot]
---
 examples/community/img2img_inpainting.py      |  2 +-
 examples/community/matryoshka.py              | 17 ++++++++-----
 .../pipeline_faithdiff_stable_diffusion_xl.py | 11 +++++---
 .../models/autoencoders/autoencoder_dc.py     | 14 +++++------
 .../models/autoencoders/autoencoder_kl.py     |  6 ++---
 .../autoencoders/autoencoder_kl_cogvideox.py  |  6 ++---
 .../autoencoder_kl_hunyuan_video.py           |  2 +-
 .../autoencoder_kl_hunyuanimage_refiner.py    |  2 +-
 .../autoencoders/autoencoder_kl_mochi.py      |  4 +--
 .../autoencoders/autoencoder_kl_qwenimage.py  |  2 +-
 .../autoencoder_kl_temporal_decoder.py        |  6 ++---
 .../models/autoencoders/autoencoder_kl_wan.py |  2 +-
 .../models/controlnets/controlnet_xs.py       | 25 +++++++++++--------
 .../transformers/transformer_hunyuan_video.py |  2 +-
 .../transformer_hunyuan_video_framepack.py    |  2 +-
 .../transformers/transformer_hunyuanimage.py  |  2 +-
 .../transformers/transformer_skyreels_v2.py   |  2 +-
 .../models/transformers/transformer_wan.py    |  2 +-
 .../transformers/transformer_wan_vace.py      |  2 +-
 src/diffusers/models/unets/unet_1d.py         |  8 +++---
 .../models/unets/unet_2d_condition.py         | 17 ++++++++-----
 src/diffusers/models/unets/unet_kandinsky3.py |  2 +-
 .../unets/unet_spatio_temporal_condition.py   |  8 +++---
 .../models/unets/unet_stable_cascade.py       | 10 ++++----
 src/diffusers/models/vae_flax.py              | 14 +++++------
 .../pipelines/audioldm2/modeling_audioldm2.py | 11 +++++---
 .../versatile_diffusion/modeling_text_unet.py |  6 ++---
 src/diffusers/pipelines/shap_e/renderer.py    |  8 +++---
 28 files changed, 110 insertions(+), 85 deletions(-)

diff --git a/examples/community/img2img_inpainting.py b/examples/community/img2img_inpainting.py
index 595df107ca..bef682425a 100644
--- a/examples/community/img2img_inpainting.py
+++ b/examples/community/img2img_inpainting.py
@@ -45,7 +45,7 @@ def check_size(image, height, width):
         raise ValueError(f"Image size should be {height}x{width}, but got {h}x{w}")
 
 
-def overlay_inner_image(image, inner_image, paste_offset: Tuple[int] = (0, 0)):
+def overlay_inner_image(image, inner_image, paste_offset: Tuple[int, ...] = (0, 0)):
     inner_image = inner_image.convert("RGBA")
     image = image.convert("RGB")
 
diff --git a/examples/community/matryoshka.py b/examples/community/matryoshka.py
index 97ad8b9e86..09b1d1b244 100644
--- a/examples/community/matryoshka.py
+++ b/examples/community/matryoshka.py
@@ -1966,16 +1966,21 @@ class MatryoshkaUNet2DConditionModel(
         center_input_sample: bool = False,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
         mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        up_block_types: Tuple[str, ...] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
@@ -2294,10 +2299,10 @@ class MatryoshkaUNet2DConditionModel(
 
     def _check_config(
         self,
-        down_block_types: Tuple[str],
-        up_block_types: Tuple[str],
+        down_block_types: Tuple[str, ...],
+        up_block_types: Tuple[str, ...],
         only_cross_attention: Union[bool, Tuple[bool]],
-        block_out_channels: Tuple[int],
+        block_out_channels: Tuple[int, ...],
         layers_per_block: Union[int, Tuple[int]],
         cross_attention_dim: Union[int, Tuple[int]],
         transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]],
diff --git a/examples/community/pipeline_faithdiff_stable_diffusion_xl.py b/examples/community/pipeline_faithdiff_stable_diffusion_xl.py
index a8fdc133d0..3ebaed1fd5 100644
--- a/examples/community/pipeline_faithdiff_stable_diffusion_xl.py
+++ b/examples/community/pipeline_faithdiff_stable_diffusion_xl.py
@@ -438,16 +438,21 @@ class UNet2DConditionModel(OriginalUNet2DConditionModel, ConfigMixin, UNet2DCond
         center_input_sample: bool = False,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
         mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        up_block_types: Tuple[str, ...] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
diff --git a/src/diffusers/models/autoencoders/autoencoder_dc.py b/src/diffusers/models/autoencoders/autoencoder_dc.py
index 724ec3bb76..ec301ef8ad 100644
--- a/src/diffusers/models/autoencoders/autoencoder_dc.py
+++ b/src/diffusers/models/autoencoders/autoencoder_dc.py
@@ -102,7 +102,7 @@ def get_block(
     attention_head_dim: int,
     norm_type: str,
     act_fn: str,
-    qkv_mutliscales: Tuple[int] = (),
+    qkv_mutliscales: Tuple[int, ...] = (),
 ):
     if block_type == "ResBlock":
         block = ResBlock(in_channels, out_channels, norm_type, act_fn)
@@ -206,8 +206,8 @@ class Encoder(nn.Module):
         latent_channels: int,
         attention_head_dim: int = 32,
         block_type: Union[str, Tuple[str]] = "ResBlock",
-        block_out_channels: Tuple[int] = (128, 256, 512, 512, 1024, 1024),
-        layers_per_block: Tuple[int] = (2, 2, 2, 2, 2, 2),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
+        layers_per_block: Tuple[int, ...] = (2, 2, 2, 2, 2, 2),
         qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
         downsample_block_type: str = "pixel_unshuffle",
         out_shortcut: bool = True,
@@ -292,8 +292,8 @@ class Decoder(nn.Module):
         latent_channels: int,
         attention_head_dim: int = 32,
         block_type: Union[str, Tuple[str]] = "ResBlock",
-        block_out_channels: Tuple[int] = (128, 256, 512, 512, 1024, 1024),
-        layers_per_block: Tuple[int] = (2, 2, 2, 2, 2, 2),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
+        layers_per_block: Tuple[int, ...] = (2, 2, 2, 2, 2, 2),
         qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
         norm_type: Union[str, Tuple[str]] = "rms_norm",
         act_fn: Union[str, Tuple[str]] = "silu",
@@ -440,8 +440,8 @@ class AutoencoderDC(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalModel
         decoder_block_types: Union[str, Tuple[str]] = "ResBlock",
         encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
         decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512, 1024, 1024),
-        encoder_layers_per_block: Tuple[int] = (2, 2, 2, 3, 3, 3),
-        decoder_layers_per_block: Tuple[int] = (3, 3, 3, 3, 3, 3),
+        encoder_layers_per_block: Tuple[int, ...] = (2, 2, 2, 3, 3, 3),
+        decoder_layers_per_block: Tuple[int, ...] = (3, 3, 3, 3, 3, 3),
         encoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
         decoder_qkv_multiscales: Tuple[Tuple[int, ...], ...] = ((), (), (), (5,), (5,), (5,)),
         upsample_block_type: str = "pixel_shuffle",
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py
index 1a72aa3cfe..ffc8778e7a 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -78,9 +78,9 @@ class AutoencoderKL(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalModel
         self,
         in_channels: int = 3,
         out_channels: int = 3,
-        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
-        up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
-        block_out_channels: Tuple[int] = (64,),
+        down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
+        up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
         layers_per_block: int = 1,
         act_fn: str = "silu",
         latent_channels: int = 4,
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
index 5096b725d0..79433f7b92 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
@@ -995,19 +995,19 @@ class AutoencoderKLCogVideoX(ModelMixin, AutoencoderMixin, ConfigMixin, FromOrig
         self,
         in_channels: int = 3,
         out_channels: int = 3,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CogVideoXDownBlock3D",
             "CogVideoXDownBlock3D",
             "CogVideoXDownBlock3D",
             "CogVideoXDownBlock3D",
         ),
-        up_block_types: Tuple[str] = (
+        up_block_types: Tuple[str, ...] = (
             "CogVideoXUpBlock3D",
             "CogVideoXUpBlock3D",
             "CogVideoXUpBlock3D",
             "CogVideoXUpBlock3D",
         ),
-        block_out_channels: Tuple[int] = (128, 256, 256, 512),
+        block_out_channels: Tuple[int, ...] = (128, 256, 256, 512),
         latent_channels: int = 16,
         layers_per_block: int = 3,
         act_fn: str = "silu",
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py
index 88b9bb507f..ddc0aed6b0 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.py
@@ -653,7 +653,7 @@ class AutoencoderKLHunyuanVideo(ModelMixin, AutoencoderMixin, ConfigMixin):
             "HunyuanVideoUpBlock3D",
             "HunyuanVideoUpBlock3D",
         ),
-        block_out_channels: Tuple[int] = (128, 256, 512, 512),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
         layers_per_block: int = 2,
         act_fn: str = "silu",
         norm_num_groups: int = 32,
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py
index af40c7a6cb..2249063a9f 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_hunyuanimage_refiner.py
@@ -601,7 +601,7 @@ class AutoencoderKLHunyuanImageRefiner(ModelMixin, ConfigMixin):
         in_channels: int = 3,
         out_channels: int = 3,
         latent_channels: int = 32,
-        block_out_channels: Tuple[int] = (128, 256, 512, 1024, 1024),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 1024, 1024),
         layers_per_block: int = 2,
         spatial_compression_ratio: int = 16,
         temporal_compression_ratio: int = 4,
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py b/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py
index 3ded9a0a54..7a64ac7de1 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_mochi.py
@@ -688,8 +688,8 @@ class AutoencoderKLMochi(ModelMixin, AutoencoderMixin, ConfigMixin):
         self,
         in_channels: int = 15,
         out_channels: int = 3,
-        encoder_block_out_channels: Tuple[int] = (64, 128, 256, 384),
-        decoder_block_out_channels: Tuple[int] = (128, 256, 512, 768),
+        encoder_block_out_channels: Tuple[int, ...] = (64, 128, 256, 384),
+        decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 768),
         latent_channels: int = 12,
         layers_per_block: Tuple[int, ...] = (3, 3, 4, 6, 3),
         act_fn: str = "silu",
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py b/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
index 14db6aeb61..0aadbad9f4 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_qwenimage.py
@@ -679,7 +679,7 @@ class AutoencoderKLQwenImage(ModelMixin, AutoencoderMixin, ConfigMixin, FromOrig
         self,
         base_dim: int = 96,
         z_dim: int = 16,
-        dim_mult: Tuple[int] = [1, 2, 4, 4],
+        dim_mult: Tuple[int, ...] = (1, 2, 4, 4),
         num_res_blocks: int = 2,
         attn_scales: List[float] = [],
         temperal_downsample: List[bool] = [False, True, True],
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
index ab76254d19..167fdc6bda 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
@@ -31,7 +31,7 @@ class TemporalDecoder(nn.Module):
         self,
         in_channels: int = 4,
         out_channels: int = 3,
-        block_out_channels: Tuple[int] = (128, 256, 512, 512),
+        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
         layers_per_block: int = 2,
     ):
         super().__init__()
@@ -172,8 +172,8 @@ class AutoencoderKLTemporalDecoder(ModelMixin, AutoencoderMixin, ConfigMixin):
         self,
         in_channels: int = 3,
         out_channels: int = 3,
-        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
-        block_out_channels: Tuple[int] = (64,),
+        down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
+        block_out_channels: Tuple[int, ...] = (64,),
         layers_per_block: int = 1,
         latent_channels: int = 4,
         sample_size: int = 32,
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py
index f8bdfeb755..5b4b74543a 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py
@@ -971,7 +971,7 @@ class AutoencoderKLWan(ModelMixin, AutoencoderMixin, ConfigMixin, FromOriginalMo
         base_dim: int = 96,
         decoder_base_dim: Optional[int] = None,
         z_dim: int = 16,
-        dim_mult: Tuple[int] = [1, 2, 4, 4],
+        dim_mult: Tuple[int, ...] = (1, 2, 4, 4),
         num_res_blocks: int = 2,
         attn_scales: List[float] = [],
         temperal_downsample: List[bool] = [False, True, True],
diff --git a/src/diffusers/models/controlnets/controlnet_xs.py b/src/diffusers/models/controlnets/controlnet_xs.py
index f5c69b9a46..5b9d36ba91 100644
--- a/src/diffusers/models/controlnets/controlnet_xs.py
+++ b/src/diffusers/models/controlnets/controlnet_xs.py
@@ -293,14 +293,14 @@ class ControlNetXSAdapter(ModelMixin, ConfigMixin):
         self,
         conditioning_channels: int = 3,
         conditioning_channel_order: str = "rgb",
-        conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+        conditioning_embedding_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
         time_embedding_mix: float = 1.0,
         learn_time_embedding: bool = False,
         num_attention_heads: Union[int, Tuple[int]] = 4,
-        block_out_channels: Tuple[int] = (4, 8, 16, 16),
-        base_block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (4, 8, 16, 16),
+        base_block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         cross_attention_dim: int = 1024,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
@@ -436,7 +436,7 @@ class ControlNetXSAdapter(ModelMixin, ConfigMixin):
         time_embedding_mix: int = 1.0,
         conditioning_channels: int = 3,
         conditioning_channel_order: str = "rgb",
-        conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+        conditioning_embedding_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
     ):
         r"""
         Instantiate a [`ControlNetXSAdapter`] from a [`UNet2DConditionModel`].
@@ -529,14 +529,19 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
         self,
         # unet configs
         sample_size: Optional[int] = 96,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        up_block_types: Tuple[str, ...] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         norm_num_groups: Optional[int] = 32,
         cross_attention_dim: Union[int, Tuple[int]] = 1024,
         transformer_layers_per_block: Union[int, Tuple[int]] = 1,
@@ -550,10 +555,10 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
         # additional controlnet configs
         time_embedding_mix: float = 1.0,
         ctrl_conditioning_channels: int = 3,
-        ctrl_conditioning_embedding_out_channels: Tuple[int] = (16, 32, 96, 256),
+        ctrl_conditioning_embedding_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
         ctrl_conditioning_channel_order: str = "rgb",
         ctrl_learn_time_embedding: bool = False,
-        ctrl_block_out_channels: Tuple[int] = (4, 8, 16, 16),
+        ctrl_block_out_channels: Tuple[int, ...] = (4, 8, 16, 16),
         ctrl_num_attention_heads: Union[int, Tuple[int]] = 4,
         ctrl_max_norm_num_groups: int = 32,
     ):
diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py
index c564d4e40d..0e82f9742a 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py
@@ -914,7 +914,7 @@ class HunyuanVideoTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, 
         text_embed_dim: int = 4096,
         pooled_projection_dim: int = 768,
         rope_theta: float = 256.0,
-        rope_axes_dim: Tuple[int] = (16, 56, 56),
+        rope_axes_dim: Tuple[int, ...] = (16, 56, 56),
         image_condition_type: Optional[str] = None,
     ) -> None:
         super().__init__()
diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py
index 60b40fff3c..601ba0f0b4 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py
@@ -139,7 +139,7 @@ class HunyuanVideoFramepackTransformer3DModel(
         text_embed_dim: int = 4096,
         pooled_projection_dim: int = 768,
         rope_theta: float = 256.0,
-        rope_axes_dim: Tuple[int] = (16, 56, 56),
+        rope_axes_dim: Tuple[int, ...] = (16, 56, 56),
         image_condition_type: Optional[str] = None,
         has_image_proj: int = False,
         image_proj_dim: int = 1152,
diff --git a/src/diffusers/models/transformers/transformer_hunyuanimage.py b/src/diffusers/models/transformers/transformer_hunyuanimage.py
index 7f37bf815b..bcca9a4cb9 100644
--- a/src/diffusers/models/transformers/transformer_hunyuanimage.py
+++ b/src/diffusers/models/transformers/transformer_hunyuanimage.py
@@ -689,7 +689,7 @@ class HunyuanImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, 
         text_embed_dim: int = 3584,
         text_embed_2_dim: Optional[int] = None,
         rope_theta: float = 256.0,
-        rope_axes_dim: Tuple[int] = (64, 64),
+        rope_axes_dim: Tuple[int, ...] = (64, 64),
         use_meanflow: bool = False,
     ) -> None:
         super().__init__()
diff --git a/src/diffusers/models/transformers/transformer_skyreels_v2.py b/src/diffusers/models/transformers/transformer_skyreels_v2.py
index 6b600aa224..62698b97cc 100644
--- a/src/diffusers/models/transformers/transformer_skyreels_v2.py
+++ b/src/diffusers/models/transformers/transformer_skyreels_v2.py
@@ -570,7 +570,7 @@ class SkyReelsV2Transformer3DModel(
     @register_to_config
     def __init__(
         self,
-        patch_size: Tuple[int] = (1, 2, 2),
+        patch_size: Tuple[int, ...] = (1, 2, 2),
         num_attention_heads: int = 16,
         attention_head_dim: int = 128,
         in_channels: int = 16,
diff --git a/src/diffusers/models/transformers/transformer_wan.py b/src/diffusers/models/transformers/transformer_wan.py
index 6f3993eb3f..4d6e8a2dae 100644
--- a/src/diffusers/models/transformers/transformer_wan.py
+++ b/src/diffusers/models/transformers/transformer_wan.py
@@ -563,7 +563,7 @@ class WanTransformer3DModel(
     @register_to_config
     def __init__(
         self,
-        patch_size: Tuple[int] = (1, 2, 2),
+        patch_size: Tuple[int, ...] = (1, 2, 2),
         num_attention_heads: int = 40,
         attention_head_dim: int = 128,
         in_channels: int = 16,
diff --git a/src/diffusers/models/transformers/transformer_wan_vace.py b/src/diffusers/models/transformers/transformer_wan_vace.py
index 30c38c244a..1be4f73e33 100644
--- a/src/diffusers/models/transformers/transformer_wan_vace.py
+++ b/src/diffusers/models/transformers/transformer_wan_vace.py
@@ -182,7 +182,7 @@ class WanVACETransformer3DModel(
     @register_to_config
     def __init__(
         self,
-        patch_size: Tuple[int] = (1, 2, 2),
+        patch_size: Tuple[int, ...] = (1, 2, 2),
         num_attention_heads: int = 40,
         attention_head_dim: int = 128,
         in_channels: int = 16,
diff --git a/src/diffusers/models/unets/unet_1d.py b/src/diffusers/models/unets/unet_1d.py
index 4c4c528a59..a027c553ed 100644
--- a/src/diffusers/models/unets/unet_1d.py
+++ b/src/diffusers/models/unets/unet_1d.py
@@ -86,11 +86,11 @@ class UNet1DModel(ModelMixin, ConfigMixin):
         flip_sin_to_cos: bool = True,
         use_timestep_embedding: bool = False,
         freq_shift: float = 0.0,
-        down_block_types: Tuple[str] = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
-        up_block_types: Tuple[str] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
-        mid_block_type: Tuple[str] = "UNetMidBlock1D",
+        down_block_types: Tuple[str, ...] = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
+        up_block_types: Tuple[str, ...] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
+        mid_block_type: str = "UNetMidBlock1D",
         out_block_type: str = None,
-        block_out_channels: Tuple[int] = (32, 32, 64),
+        block_out_channels: Tuple[int, ...] = (32, 32, 64),
         act_fn: str = None,
         norm_num_groups: int = 8,
         layers_per_block: int = 1,
diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py
index f04d3dfa01..a96d9d8ab0 100644
--- a/src/diffusers/models/unets/unet_2d_condition.py
+++ b/src/diffusers/models/unets/unet_2d_condition.py
@@ -177,16 +177,21 @@ class UNet2DConditionModel(
         center_input_sample: bool = False,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
         mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        up_block_types: Tuple[str, ...] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
@@ -486,10 +491,10 @@ class UNet2DConditionModel(
 
     def _check_config(
         self,
-        down_block_types: Tuple[str],
-        up_block_types: Tuple[str],
+        down_block_types: Tuple[str, ...],
+        up_block_types: Tuple[str, ...],
         only_cross_attention: Union[bool, Tuple[bool]],
-        block_out_channels: Tuple[int],
+        block_out_channels: Tuple[int, ...],
         layers_per_block: Union[int, Tuple[int]],
         cross_attention_dim: Union[int, Tuple[int]],
         transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]],
diff --git a/src/diffusers/models/unets/unet_kandinsky3.py b/src/diffusers/models/unets/unet_kandinsky3.py
index 27241ce2e6..7c7948c4fd 100644
--- a/src/diffusers/models/unets/unet_kandinsky3.py
+++ b/src/diffusers/models/unets/unet_kandinsky3.py
@@ -54,7 +54,7 @@ class Kandinsky3UNet(ModelMixin, ConfigMixin):
         groups: int = 32,
         attention_head_dim: int = 64,
         layers_per_block: Union[int, Tuple[int]] = 3,
-        block_out_channels: Tuple[int] = (384, 768, 1536, 3072),
+        block_out_channels: Tuple[int, ...] = (384, 768, 1536, 3072),
         cross_attention_dim: Union[int, Tuple[int]] = 4096,
         encoder_hid_dim: int = 4096,
     ):
diff --git a/src/diffusers/models/unets/unet_spatio_temporal_condition.py b/src/diffusers/models/unets/unet_spatio_temporal_condition.py
index 059a6e807c..63e9bee409 100644
--- a/src/diffusers/models/unets/unet_spatio_temporal_condition.py
+++ b/src/diffusers/models/unets/unet_spatio_temporal_condition.py
@@ -73,25 +73,25 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
         sample_size: Optional[int] = None,
         in_channels: int = 8,
         out_channels: int = 4,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlockSpatioTemporal",
             "CrossAttnDownBlockSpatioTemporal",
             "CrossAttnDownBlockSpatioTemporal",
             "DownBlockSpatioTemporal",
         ),
-        up_block_types: Tuple[str] = (
+        up_block_types: Tuple[str, ...] = (
             "UpBlockSpatioTemporal",
             "CrossAttnUpBlockSpatioTemporal",
             "CrossAttnUpBlockSpatioTemporal",
             "CrossAttnUpBlockSpatioTemporal",
         ),
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         addition_time_embed_dim: int = 256,
         projection_class_embeddings_input_dim: int = 768,
         layers_per_block: Union[int, Tuple[int]] = 2,
         cross_attention_dim: Union[int, Tuple[int]] = 1024,
         transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
-        num_attention_heads: Union[int, Tuple[int]] = (5, 10, 20, 20),
+        num_attention_heads: Union[int, Tuple[int, ...]] = (5, 10, 20, 20),
         num_frames: int = 25,
     ):
         super().__init__()
diff --git a/src/diffusers/models/unets/unet_stable_cascade.py b/src/diffusers/models/unets/unet_stable_cascade.py
index e79ce8ac1d..23d358c1bf 100644
--- a/src/diffusers/models/unets/unet_stable_cascade.py
+++ b/src/diffusers/models/unets/unet_stable_cascade.py
@@ -145,10 +145,10 @@ class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         timestep_ratio_embedding_dim: int = 64,
         patch_size: int = 1,
         conditioning_dim: int = 2048,
-        block_out_channels: Tuple[int] = (2048, 2048),
-        num_attention_heads: Tuple[int] = (32, 32),
-        down_num_layers_per_block: Tuple[int] = (8, 24),
-        up_num_layers_per_block: Tuple[int] = (24, 8),
+        block_out_channels: Tuple[int, ...] = (2048, 2048),
+        num_attention_heads: Tuple[int, ...] = (32, 32),
+        down_num_layers_per_block: Tuple[int, ...] = (8, 24),
+        up_num_layers_per_block: Tuple[int, ...] = (24, 8),
         down_blocks_repeat_mappers: Optional[Tuple[int]] = (
             1,
             1,
@@ -167,7 +167,7 @@ class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         kernel_size=3,
         dropout: Union[float, Tuple[float]] = (0.1, 0.1),
         self_attn: Union[bool, Tuple[bool]] = True,
-        timestep_conditioning_type: Tuple[str] = ("sca", "crp"),
+        timestep_conditioning_type: Tuple[str, ...] = ("sca", "crp"),
         switch_level: Optional[Tuple[bool]] = None,
     ):
         """
diff --git a/src/diffusers/models/vae_flax.py b/src/diffusers/models/vae_flax.py
index 13653b9037..5aad386a89 100644
--- a/src/diffusers/models/vae_flax.py
+++ b/src/diffusers/models/vae_flax.py
@@ -532,8 +532,8 @@ class FlaxEncoder(nn.Module):
 
     in_channels: int = 3
     out_channels: int = 3
-    down_block_types: Tuple[str] = ("DownEncoderBlock2D",)
-    block_out_channels: Tuple[int] = (64,)
+    down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",)
+    block_out_channels: Tuple[int, ...] = (64,)
     layers_per_block: int = 2
     norm_num_groups: int = 32
     act_fn: str = "silu"
@@ -650,8 +650,8 @@ class FlaxDecoder(nn.Module):
 
    in_channels: int = 3
     out_channels: int = 3
-    up_block_types: Tuple[str] = ("UpDecoderBlock2D",)
-    block_out_channels: int = (64,)
+    up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",)
+    block_out_channels: Tuple[int, ...] = (64,)
     layers_per_block: int = 2
     norm_num_groups: int = 32
     act_fn: str = "silu"
@@ -823,9 +823,9 @@ class FlaxAutoencoderKL(nn.Module, FlaxModelMixin, ConfigMixin):
 
     in_channels: int = 3
     out_channels: int = 3
-    down_block_types: Tuple[str] = ("DownEncoderBlock2D",)
-    up_block_types: Tuple[str] = ("UpDecoderBlock2D",)
-    block_out_channels: Tuple[int] = (64,)
+    down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",)
+    up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",)
+    block_out_channels: Tuple[int, ...] = (64,)
     layers_per_block: int = 1
     act_fn: str = "silu"
     latent_channels: int = 4
diff --git a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
index b6b40cd6e6..1749300b35 100644
--- a/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py
@@ -245,16 +245,21 @@ class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoad
         out_channels: int = 4,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "CrossAttnDownBlock2D",
             "DownBlock2D",
         ),
         mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
-        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        up_block_types: Tuple[str, ...] = (
+            "UpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+            "CrossAttnUpBlock2D",
+        ),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
diff --git a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
index 397fbc0d85..7c25713cd1 100644
--- a/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py
@@ -374,21 +374,21 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
         center_input_sample: bool = False,
         flip_sin_to_cos: bool = True,
         freq_shift: int = 0,
-        down_block_types: Tuple[str] = (
+        down_block_types: Tuple[str, ...] = (
             "CrossAttnDownBlockFlat",
             "CrossAttnDownBlockFlat",
             "CrossAttnDownBlockFlat",
             "DownBlockFlat",
         ),
         mid_block_type: Optional[str] = "UNetMidBlockFlatCrossAttn",
-        up_block_types: Tuple[str] = (
+        up_block_types: Tuple[str, ...] = (
             "UpBlockFlat",
             "CrossAttnUpBlockFlat",
             "CrossAttnUpBlockFlat",
             "CrossAttnUpBlockFlat",
         ),
         only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
         layers_per_block: Union[int, Tuple[int]] = 2,
         downsample_padding: int = 1,
         mid_block_scale_factor: float = 1,
diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py
index b268eae806..d1d05c8945 100644
--- a/src/diffusers/pipelines/shap_e/renderer.py
+++ b/src/diffusers/pipelines/shap_e/renderer.py
@@ -742,7 +742,7 @@ class ShapEParamsProjModel(ModelMixin, ConfigMixin):
     def __init__(
         self,
         *,
-        param_names: Tuple[str] = (
+        param_names: Tuple[str, ...] = (
             "nerstf.mlp.0.weight",
             "nerstf.mlp.1.weight",
             "nerstf.mlp.2.weight",
@@ -786,13 +786,13 @@ class ShapERenderer(ModelMixin, ConfigMixin):
     def __init__(
         self,
         *,
-        param_names: Tuple[str] = (
+        param_names: Tuple[str, ...] = (
             "nerstf.mlp.0.weight",
             "nerstf.mlp.1.weight",
             "nerstf.mlp.2.weight",
             "nerstf.mlp.3.weight",
         ),
-        param_shapes: Tuple[Tuple[int]] = (
+        param_shapes: Tuple[Tuple[int, int], ...] = (
             (256, 93),
             (256, 256),
             (256, 256),
@@ -804,7 +804,7 @@ class ShapERenderer(ModelMixin, ConfigMixin):
         n_hidden_layers: int = 6,
         act_fn: str = "swish",
         insert_direction_at: int = 4,
-        background: Tuple[float] = (
+        background: Tuple[float, ...] = (
             255.0,
             255.0,
             255.0,
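---

Note on the motivation (supplementary to the patch): under PEP 484, `Tuple[int]` is
the type of a tuple with exactly one int, so the old annotations contradicted
multi-element defaults such as `(320, 640, 1280, 1280)`, while `Tuple[int, ...]`
means a homogeneous int tuple of any length. A minimal sketch, using hypothetical
functions `f` and `g` (not from the patched files), of how a type checker such as
mypy treats the two forms:

    from typing import Tuple

    def f(block_out_channels: Tuple[int] = (64,)) -> None:
        # Fixed-length hint: only a 1-tuple of int is accepted.
        ...

    def g(block_out_channels: Tuple[int, ...] = (64,)) -> None:
        # Variable-length hint: any length of int tuple is accepted.
        ...

    f((320, 640, 1280, 1280))  # mypy error: expected a 1-tuple of int
    g((320, 640, 1280, 1280))  # OK: variable-length homogeneous tuple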