up

2026-01-27 17:22:53 +03:00 · 2025-12-22 10:24:55 +00:00
parent 58257eb0e0
commit 059999a3f7
2 changed files with 12 additions and 42 deletions
--- a/scripts/log.txt
+++ b/scripts/log.txt
@@ -1,32 +0,0 @@
-ddconfig={'double_z': True, 'mel_bins': 64, 'z_channels': 8, 'resolution': 256, 'downsample_time': False, 'in_channels': 2, 'out_ch': 2, 'ch': 128, 'ch_mult': [1, 2, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0, 'mid_block_add_attention': False, 'norm_type': 'pixel', 'causality_axis': 'height'}, sample_rate=16000, mel_hop_length=160, is_causal=True, mel_bins=64
-mid_block_add_attention=False, attn_resolutions=[]
-k='mid.block_1.conv1.conv.weight'
-k='mid.block_1.conv1.conv.bias'
-k='mid.block_1.conv2.conv.weight'
-k='mid.block_1.conv2.conv.bias'
-k='mid.block_2.conv1.conv.weight'
-k='mid.block_2.conv1.conv.bias'
-k='mid.block_2.conv2.conv.weight'
-k='mid.block_2.conv2.conv.bias'
-Traceback (most recent call last):
-  File "/fsx/sayak/diffusers-new-model-addition-ltx2/scripts/test_ltx2_audio_conversion.py", line 97, in <module>
-    main()
-  File "/fsx/sayak/miniconda3/envs/diffusers/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
-    return func(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^
-  File "/fsx/sayak/diffusers-new-model-addition-ltx2/scripts/test_ltx2_audio_conversion.py", line 85, in main
-    original_out = original_decoder(dummy)
-                   ^^^^^^^^^^^^^^^^^^^^^^^
-  File "/fsx/sayak/miniconda3/envs/diffusers/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
-    return self._call_impl(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/fsx/sayak/miniconda3/envs/diffusers/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
-    return forward_call(*args, **kwargs)
-           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/fsx/sayak/ltx-2/ltx-core/src/ltx_core/model/audio_vae/audio_vae.py", line 206, in forward
-    sample_denormalized = self.per_channel_statistics.un_normalize(sample_patched)
-                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-  File "/fsx/sayak/ltx-2/ltx-core/src/ltx_core/model/audio_vae/ops.py", line 27, in un_normalize
-    return (x * self.get_buffer("std-of-means").to(x)) + self.get_buffer("mean-of-means").to(x)
-            ~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-RuntimeError: The size of tensor a (512) must match the size of tensor b (128) at non-singleton dimension 2
--- a/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_ltx2_audio.py
@@ -99,8 +99,9 @@ class LTX2AudioCausalConv2d(nn.Module):
        super().__init__()

        self.causality_axis = causality_axis
-        kernel_size = nn.modules.utils._pair(kernel_size)
-        dilation = nn.modules.utils._pair(dilation)
+        kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
+        dilation = (dilation, dilation) if isinstance(dilation, int) else dilation
+

        pad_h = (kernel_size[0] - 1) * dilation[0]
        pad_w = (kernel_size[1] - 1) * dilation[1]
@@ -232,7 +233,7 @@ class LTX2AudioResnetBlock(nn.Module):
    def forward(
        self,
        x: torch.Tensor,
-        temb: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        h = self.norm1(x)
        h = self.non_linearity(h)
@@ -257,7 +258,7 @@ class LTX2AudioUpsample(nn.Module):
        self,
        in_channels: int,
        with_conv: bool,
-        causality_axis: Optional[str] = "height",
+        causality_axis: Optional[str] = "height"
    ) -> None:
        super().__init__()
        self.with_conv = with_conv
@@ -291,10 +292,11 @@ class LTX2AudioPerChannelStatistics(nn.Module):

    def __init__(self, latent_channels: int = 128) -> None:
        super().__init__()
+        # NOTE(Sayak): buffers created with `torch.empty` hold uninitialized values, which keeps causing problems in CI. Should we initialize them with `torch.ones` instead?
        self.register_buffer("std-of-means", torch.empty(latent_channels))
        self.register_buffer("mean-of-means", torch.empty(latent_channels))

-    def un_normalize(self, x: torch.Tensor) -> torch.Tensor:
+    def denormalize(self, x: torch.Tensor) -> torch.Tensor:
        return (x * self.get_buffer("std-of-means").to(x)) + self.get_buffer("mean-of-means").to(x)

    def normalize(self, x: torch.Tensor) -> torch.Tensor:
@@ -327,7 +329,7 @@ class LTX2AudioAudioPatchifier:
    def unpatchify(
        self,
        audio_latents: torch.Tensor,
-        output_shape: AudioLatentShape,
+        output_shape: AudioLatentShape
    ) -> torch.Tensor:
        batch, time, _ = audio_latents.shape
        channels = output_shape.channels
@@ -421,7 +423,7 @@ class LTX2AudioDecoder(nn.Module):
    def _adjust_output_shape(
        self,
        decoded_output: torch.Tensor,
-        target_shape: AudioLatentShape,
+        target_shape: AudioLatentShape
    ) -> torch.Tensor:
        _, _, current_time, current_freq = decoded_output.shape
        target_channels = target_shape.channels
@@ -460,7 +462,7 @@ class LTX2AudioDecoder(nn.Module):
        )

        sample_patched = self.patchifier.patchify(sample)
-        sample_denormalized = self.per_channel_statistics.un_normalize(sample_patched)
+        sample_denormalized = self.per_channel_statistics.denormalize(sample_patched)
        sample = self.patchifier.unpatchify(sample_denormalized, latent_shape)

        target_frames = latent_shape.frames * LATENT_DOWNSAMPLE_FACTOR
@@ -509,7 +511,7 @@ class LTX2AudioDecoder(nn.Module):
        self,
        initial_block_channels: int,
        dropout: float,
-        resamp_with_conv: bool,
+        resamp_with_conv: bool
    ) -> tuple[nn.ModuleList, int]:
        up_modules = nn.ModuleList()
        block_in = initial_block_channels
@@ -630,7 +632,7 @@ class AutoencoderKLLTX2Audio(ModelMixin, AutoencoderMixin, ConfigMixin):
    def encode(
        self,
        x: torch.Tensor,
-        return_dict: bool = True,
+        return_dict: bool = True
    ):
        raise NotImplementedError("AutoencoderKLLTX2Audio does not implement encoding.")