From 059999a3f7ad3fe3077f61812e3b3de91136f4bb Mon Sep 17 00:00:00 2001
From: Sayak Paul <spsayakpaul@gmail.com>
Date: Mon, 22 Dec 2025 10:24:55 +0000
Subject: [PATCH] LTX2 audio VAE: rename un_normalize to denormalize, drop private _pair helper, remove debug log

---
 scripts/log.txt                               | 32 -------------------
 .../autoencoders/autoencoder_kl_ltx2_audio.py | 22 +++++++------
 2 files changed, 12 insertions(+), 42 deletions(-)
 delete mode 100644 scripts/log.txt
diff --git a/scripts/log.txt b/scripts/log.txt
deleted file mode 100644
index aa3046d42a..0000000000
--- a/scripts/log.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-ddconfig={'double_z': True, 'mel_bins': 64, 'z_channels': 8, 'resolution': 256, 'downsample_time': False, 'in_channels': 2, 'out_ch': 2, 'ch': 128, 'ch_mult': [1, 2, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0, 'mid_block_add_attention': False, 'norm_type': 'pixel', 'causality_axis': 'height'}, sample_rate=16000, mel_hop_length=160, is_causal=True, mel_bins=64
-mid_block_add_attention=False, attn_resolutions=[]
-k='mid.block_1.conv1.conv.weight'
-k='mid.block_1.conv1.conv.bias'
-k='mid.block_1.conv2.conv.weight'
-k='mid.block_1.conv2.conv.bias'
-k='mid.block_2.conv1.conv.weight'
-k='mid.block_2.conv1.conv.bias'
-k='mid.block_2.conv2.conv.weight'
-k='mid.block_2.conv2.conv.bias'
-Traceback (most recent call last):
-  File "/fsx/sayak/diffusers-new-model-addition-ltx2/scripts/test_ltx2_audio_conversion.py", line 97, in <module>
-    main()
-  File "/fsx/sayak/miniconda3/envs/diffusers/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
-    return func(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^
-  File "/fsx/sayak/diffusers-new-model-addition-ltx2/scripts/test_ltx2_audio_conversion.py", line 85, in main
-    original_out = original_decoder(dummy)
-                   ^^^^^^^^^^^^^^^^^^^^^^^
-  File "/fsx/sayak/miniconda3/envs/diffusers/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
-    return self._call_impl(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/fsx/sayak/miniconda3/envs/diffusers/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
-    return forward_call(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/fsx/sayak/ltx-2/ltx-core/src/ltx_core/model/audio_vae/audio_vae.py", line 206, in forward
-    sample_denormalized = self.per_channel_statistics.un_normalize(sample_patched)
-                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/fsx/sayak/ltx-2/ltx-core/src/ltx_core/model/audio_vae/ops.py", line 27, in un_normalize
-    return (x * self.get_buffer("std-of-means").to(x)) + self.get_buffer("mean-of-means").to(x)
-            ~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-RuntimeError: The size of tensor a (512) must match the size of tensor b (128) at non-singleton dimension 2
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py b/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py
index e7960c3e14..1385b414b9 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py
@@ -99,8 +99,9 @@ class LTX2AudioCausalConv2d(nn.Module):
         super().__init__()
 
         self.causality_axis = causality_axis
-        kernel_size = nn.modules.utils._pair(kernel_size)
-        dilation = nn.modules.utils._pair(dilation)
+        kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+        dilation = (dilation, dilation) if isinstance(dilation, int) else dilation
+
 
         pad_h = (kernel_size[0] - 1) * dilation[0]
         pad_w = (kernel_size[1] - 1) * dilation[1]
@@ -232,7 +233,7 @@ class LTX2AudioResnetBlock(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        temb: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
         h = self.norm1(x)
         h = self.non_linearity(h)
@@ -257,7 +258,7 @@ class LTX2AudioUpsample(nn.Module):
         self,
         in_channels: int,
         with_conv: bool,
-        causality_axis: Optional[str] = "height",
+        causality_axis: Optional[str] = "height"
     ) -> None:
         super().__init__()
         self.with_conv = with_conv
@@ -291,10 +292,11 @@ class LTX2AudioPerChannelStatistics(nn.Module):
 
     def __init__(self, latent_channels: int = 128) -> None:
         super().__init__()
+        # TODO(sayak): `torch.empty` leaves these buffers uninitialized, which causes problems in CI; consider `torch.ones`.
         self.register_buffer("std-of-means", torch.empty(latent_channels))
         self.register_buffer("mean-of-means", torch.empty(latent_channels))
 
-    def un_normalize(self, x: torch.Tensor) -> torch.Tensor:
+    def denormalize(self, x: torch.Tensor) -> torch.Tensor:
         return (x * self.get_buffer("std-of-means").to(x)) + self.get_buffer("mean-of-means").to(x)
 
     def normalize(self, x: torch.Tensor) -> torch.Tensor:
@@ -327,7 +329,7 @@ class LTX2AudioAudioPatchifier:
     def unpatchify(
         self,
         audio_latents: torch.Tensor,
-        output_shape: AudioLatentShape,
+        output_shape: AudioLatentShape
     ) -> torch.Tensor:
         batch, time, _ = audio_latents.shape
         channels = output_shape.channels
@@ -421,7 +423,7 @@ class LTX2AudioDecoder(nn.Module):
     def _adjust_output_shape(
         self,
         decoded_output: torch.Tensor,
-        target_shape: AudioLatentShape,
+        target_shape: AudioLatentShape
     ) -> torch.Tensor:
         _, _, current_time, current_freq = decoded_output.shape
         target_channels = target_shape.channels
@@ -460,7 +462,7 @@ class LTX2AudioDecoder(nn.Module):
         )
 
         sample_patched = self.patchifier.patchify(sample)
-        sample_denormalized = self.per_channel_statistics.un_normalize(sample_patched)
+        sample_denormalized = self.per_channel_statistics.denormalize(sample_patched)
         sample = self.patchifier.unpatchify(sample_denormalized, latent_shape)
 
         target_frames = latent_shape.frames * LATENT_DOWNSAMPLE_FACTOR
@@ -509,7 +511,7 @@ class LTX2AudioDecoder(nn.Module):
         self,
         initial_block_channels: int,
         dropout: float,
-        resamp_with_conv: bool,
+        resamp_with_conv: bool
     ) -> tuple[nn.ModuleList, int]:
         up_modules = nn.ModuleList()
         block_in = initial_block_channels
@@ -630,7 +632,7 @@ class AutoencoderKLLTX2Audio(ModelMixin, AutoencoderMixin, ConfigMixin):
     def encode(
         self,
         x: torch.Tensor,
-        return_dict: bool = True,
+        return_dict: bool = True
     ):
         raise NotImplementedError("AutoencoderKLLTX2Audio does not implement encoding.")