From 059999a3f7ad3fe3077f61812e3b3de91136f4bb Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 22 Dec 2025 10:24:55 +0000 Subject: [PATCH] up --- scripts/log.txt | 32 ------------------- .../autoencoders/autoencoder_kl_ltx2_audio.py | 22 +++++++------ 2 files changed, 12 insertions(+), 42 deletions(-) delete mode 100644 scripts/log.txt diff --git a/scripts/log.txt b/scripts/log.txt deleted file mode 100644 index aa3046d42a..0000000000 --- a/scripts/log.txt +++ /dev/null @@ -1,32 +0,0 @@ -ddconfig={'double_z': True, 'mel_bins': 64, 'z_channels': 8, 'resolution': 256, 'downsample_time': False, 'in_channels': 2, 'out_ch': 2, 'ch': 128, 'ch_mult': [1, 2, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0, 'mid_block_add_attention': False, 'norm_type': 'pixel', 'causality_axis': 'height'}, sample_rate=16000, mel_hop_length=160, is_causal=True, mel_bins=64 -mid_block_add_attention=False, attn_resolutions=[] -k='mid.block_1.conv1.conv.weight' -k='mid.block_1.conv1.conv.bias' -k='mid.block_1.conv2.conv.weight' -k='mid.block_1.conv2.conv.bias' -k='mid.block_2.conv1.conv.weight' -k='mid.block_2.conv1.conv.bias' -k='mid.block_2.conv2.conv.weight' -k='mid.block_2.conv2.conv.bias' -Traceback (most recent call last): - File "/fsx/sayak/diffusers-new-model-addition-ltx2/scripts/test_ltx2_audio_conversion.py", line 97, in - main() - File "/fsx/sayak/miniconda3/envs/diffusers/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context - return func(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^ - File "/fsx/sayak/diffusers-new-model-addition-ltx2/scripts/test_ltx2_audio_conversion.py", line 85, in main - original_out = original_decoder(dummy) - ^^^^^^^^^^^^^^^^^^^^^^^ - File "/fsx/sayak/miniconda3/envs/diffusers/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/fsx/sayak/miniconda3/envs/diffusers/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl - return forward_call(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/fsx/sayak/ltx-2/ltx-core/src/ltx_core/model/audio_vae/audio_vae.py", line 206, in forward - sample_denormalized = self.per_channel_statistics.un_normalize(sample_patched) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/fsx/sayak/ltx-2/ltx-core/src/ltx_core/model/audio_vae/ops.py", line 27, in un_normalize - return (x * self.get_buffer("std-of-means").to(x)) + self.get_buffer("mean-of-means").to(x) - ~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -RuntimeError: The size of tensor a (512) must match the size of tensor b (128) at non-singleton dimension 2 diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py index e7960c3e14..1385b414b9 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py @@ -99,8 +99,9 @@ class LTX2AudioCausalConv2d(nn.Module): super().__init__() self.causality_axis = causality_axis - kernel_size = nn.modules.utils._pair(kernel_size) - dilation = nn.modules.utils._pair(dilation) + kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size + dilation = (dilation, dilation) if isinstance(dilation, int) else dilation + pad_h = (kernel_size[0] - 1) * dilation[0] pad_w = (kernel_size[1] - 1) * dilation[1] @@ -232,7 +233,7 @@ class LTX2AudioResnetBlock(nn.Module): def forward( self, x: torch.Tensor, - temb: Optional[torch.Tensor] = None, + temb: Optional[torch.Tensor] = None ) -> torch.Tensor: h = self.norm1(x) h = self.non_linearity(h) @@ -257,7 +258,7 @@ class LTX2AudioUpsample(nn.Module): self, in_channels: int, with_conv: bool, - causality_axis: Optional[str] = "height", + causality_axis: Optional[str] = "height" ) -> None: super().__init__() self.with_conv = with_conv @@ -291,10 +292,11 @@ class LTX2AudioPerChannelStatistics(nn.Module): def __init__(self, latent_channels: int = 128) -> None: super().__init__() + # Sayak notes: `empty` always causes problems in CI. Should we consider using `torch.ones`? self.register_buffer("std-of-means", torch.empty(latent_channels)) self.register_buffer("mean-of-means", torch.empty(latent_channels)) - def un_normalize(self, x: torch.Tensor) -> torch.Tensor: + def denormalize(self, x: torch.Tensor) -> torch.Tensor: return (x * self.get_buffer("std-of-means").to(x)) + self.get_buffer("mean-of-means").to(x) def normalize(self, x: torch.Tensor) -> torch.Tensor: @@ -327,7 +329,7 @@ class LTX2AudioAudioPatchifier: def unpatchify( self, audio_latents: torch.Tensor, - output_shape: AudioLatentShape, + output_shape: AudioLatentShape ) -> torch.Tensor: batch, time, _ = audio_latents.shape channels = output_shape.channels @@ -421,7 +423,7 @@ class LTX2AudioDecoder(nn.Module): def _adjust_output_shape( self, decoded_output: torch.Tensor, - target_shape: AudioLatentShape, + target_shape: AudioLatentShape ) -> torch.Tensor: _, _, current_time, current_freq = decoded_output.shape target_channels = target_shape.channels @@ -460,7 +462,7 @@ class LTX2AudioDecoder(nn.Module): ) sample_patched = self.patchifier.patchify(sample) - sample_denormalized = self.per_channel_statistics.un_normalize(sample_patched) + sample_denormalized = self.per_channel_statistics.denormalize(sample_patched) sample = self.patchifier.unpatchify(sample_denormalized, latent_shape) target_frames = latent_shape.frames * LATENT_DOWNSAMPLE_FACTOR @@ -509,7 +511,7 @@ class LTX2AudioDecoder(nn.Module): self, initial_block_channels: int, dropout: float, - resamp_with_conv: bool, + resamp_with_conv: bool ) -> tuple[nn.ModuleList, int]: up_modules = nn.ModuleList() block_in = initial_block_channels @@ -630,7 +632,7 @@ class AutoencoderKLLTX2Audio(ModelMixin, AutoencoderMixin, ConfigMixin): def encode( self, x: torch.Tensor, - return_dict: bool = True, + return_dict: bool = True ): raise NotImplementedError("AutoencoderKLLTX2Audio does not implement encoding.")