Mirror of https://github.com/huggingface/diffusers.git (last synced 2026-01-27 17:22:53 +03:00).
up
This commit is contained in:
@@ -1,32 +0,0 @@
|
||||
ddconfig={'double_z': True, 'mel_bins': 64, 'z_channels': 8, 'resolution': 256, 'downsample_time': False, 'in_channels': 2, 'out_ch': 2, 'ch': 128, 'ch_mult': [1, 2, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0, 'mid_block_add_attention': False, 'norm_type': 'pixel', 'causality_axis': 'height'}, sample_rate=16000, mel_hop_length=160, is_causal=True, mel_bins=64
|
||||
mid_block_add_attention=False, attn_resolutions=[]
|
||||
k='mid.block_1.conv1.conv.weight'
|
||||
k='mid.block_1.conv1.conv.bias'
|
||||
k='mid.block_1.conv2.conv.weight'
|
||||
k='mid.block_1.conv2.conv.bias'
|
||||
k='mid.block_2.conv1.conv.weight'
|
||||
k='mid.block_2.conv1.conv.bias'
|
||||
k='mid.block_2.conv2.conv.weight'
|
||||
k='mid.block_2.conv2.conv.bias'
|
||||
Traceback (most recent call last):
|
||||
File "/fsx/sayak/diffusers-new-model-addition-ltx2/scripts/test_ltx2_audio_conversion.py", line 97, in <module>
|
||||
main()
|
||||
File "/fsx/sayak/miniconda3/envs/diffusers/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
|
||||
return func(*args, **kwargs)
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/fsx/sayak/diffusers-new-model-addition-ltx2/scripts/test_ltx2_audio_conversion.py", line 85, in main
|
||||
original_out = original_decoder(dummy)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/fsx/sayak/miniconda3/envs/diffusers/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
|
||||
return self._call_impl(*args, **kwargs)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/fsx/sayak/miniconda3/envs/diffusers/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
|
||||
return forward_call(*args, **kwargs)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/fsx/sayak/ltx-2/ltx-core/src/ltx_core/model/audio_vae/audio_vae.py", line 206, in forward
|
||||
sample_denormalized = self.per_channel_statistics.un_normalize(sample_patched)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
File "/fsx/sayak/ltx-2/ltx-core/src/ltx_core/model/audio_vae/ops.py", line 27, in un_normalize
|
||||
return (x * self.get_buffer("std-of-means").to(x)) + self.get_buffer("mean-of-means").to(x)
|
||||
~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
RuntimeError: The size of tensor a (512) must match the size of tensor b (128) at non-singleton dimension 2
|
||||
@@ -99,8 +99,9 @@ class LTX2AudioCausalConv2d(nn.Module):
|
||||
super().__init__()
|
||||
|
||||
self.causality_axis = causality_axis
|
||||
kernel_size = nn.modules.utils._pair(kernel_size)
|
||||
dilation = nn.modules.utils._pair(dilation)
|
||||
kernel_size = (kernel_size, kernel_size) if isinstance(kernel_size, int) else kernel_size
|
||||
dilation = (dilation, dilation) if isinstance(dilation, int) else dilation
|
||||
|
||||
|
||||
pad_h = (kernel_size[0] - 1) * dilation[0]
|
||||
pad_w = (kernel_size[1] - 1) * dilation[1]
|
||||
@@ -232,7 +233,7 @@ class LTX2AudioResnetBlock(nn.Module):
|
||||
def forward(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
temb: Optional[torch.Tensor] = None,
|
||||
temb: Optional[torch.Tensor] = None
|
||||
) -> torch.Tensor:
|
||||
h = self.norm1(x)
|
||||
h = self.non_linearity(h)
|
||||
@@ -257,7 +258,7 @@ class LTX2AudioUpsample(nn.Module):
|
||||
self,
|
||||
in_channels: int,
|
||||
with_conv: bool,
|
||||
causality_axis: Optional[str] = "height",
|
||||
causality_axis: Optional[str] = "height"
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.with_conv = with_conv
|
||||
@@ -291,10 +292,11 @@ class LTX2AudioPerChannelStatistics(nn.Module):
|
||||
|
||||
def __init__(self, latent_channels: int = 128) -> None:
|
||||
super().__init__()
|
||||
# Sayak notes: `torch.empty` leaves these buffers uninitialized (arbitrary values), which always causes problems in CI. Should we consider using `torch.ones` instead?
|
||||
self.register_buffer("std-of-means", torch.empty(latent_channels))
|
||||
self.register_buffer("mean-of-means", torch.empty(latent_channels))
|
||||
|
||||
def un_normalize(self, x: torch.Tensor) -> torch.Tensor:
|
||||
def denormalize(self, x: torch.Tensor) -> torch.Tensor:
|
||||
return (x * self.get_buffer("std-of-means").to(x)) + self.get_buffer("mean-of-means").to(x)
|
||||
|
||||
def normalize(self, x: torch.Tensor) -> torch.Tensor:
|
||||
@@ -327,7 +329,7 @@ class LTX2AudioAudioPatchifier:
|
||||
def unpatchify(
|
||||
self,
|
||||
audio_latents: torch.Tensor,
|
||||
output_shape: AudioLatentShape,
|
||||
output_shape: AudioLatentShape
|
||||
) -> torch.Tensor:
|
||||
batch, time, _ = audio_latents.shape
|
||||
channels = output_shape.channels
|
||||
@@ -421,7 +423,7 @@ class LTX2AudioDecoder(nn.Module):
|
||||
def _adjust_output_shape(
|
||||
self,
|
||||
decoded_output: torch.Tensor,
|
||||
target_shape: AudioLatentShape,
|
||||
target_shape: AudioLatentShape
|
||||
) -> torch.Tensor:
|
||||
_, _, current_time, current_freq = decoded_output.shape
|
||||
target_channels = target_shape.channels
|
||||
@@ -460,7 +462,7 @@ class LTX2AudioDecoder(nn.Module):
|
||||
)
|
||||
|
||||
sample_patched = self.patchifier.patchify(sample)
|
||||
sample_denormalized = self.per_channel_statistics.un_normalize(sample_patched)
|
||||
sample_denormalized = self.per_channel_statistics.denormalize(sample_patched)
|
||||
sample = self.patchifier.unpatchify(sample_denormalized, latent_shape)
|
||||
|
||||
target_frames = latent_shape.frames * LATENT_DOWNSAMPLE_FACTOR
|
||||
@@ -509,7 +511,7 @@ class LTX2AudioDecoder(nn.Module):
|
||||
self,
|
||||
initial_block_channels: int,
|
||||
dropout: float,
|
||||
resamp_with_conv: bool,
|
||||
resamp_with_conv: bool
|
||||
) -> tuple[nn.ModuleList, int]:
|
||||
up_modules = nn.ModuleList()
|
||||
block_in = initial_block_channels
|
||||
@@ -630,7 +632,7 @@ class AutoencoderKLLTX2Audio(ModelMixin, AutoencoderMixin, ConfigMixin):
|
||||
def encode(
|
||||
self,
|
||||
x: torch.Tensor,
|
||||
return_dict: bool = True,
|
||||
return_dict: bool = True
|
||||
):
|
||||
raise NotImplementedError("AutoencoderKLLTX2Audio does not implement encoding.")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user