mirror of
https://github.com/huggingface/diffusers.git
synced 2026-01-27 17:22:53 +03:00
Fix bug when calculating audio RoPE coords
This commit is contained in:
@@ -761,11 +761,9 @@ class LTX2AudioVideoRotaryPosEmbed(nn.Module):
|
||||
"""
|
||||
|
||||
# 1. Generate coordinates in the frame (time) dimension.
|
||||
audio_duration_s = num_frames / fps
|
||||
latent_frames = int(audio_duration_s * self.audio_latents_per_second)
|
||||
# Always compute rope in fp32
|
||||
grid_f = torch.arange(
|
||||
start=shift, end=latent_frames + shift, step=self.patch_size_t, dtype=torch.float32, device=device
|
||||
start=shift, end=num_frames + shift, step=self.patch_size_t, dtype=torch.float32, device=device
|
||||
)
|
||||
|
||||
# 2. Calculate start timestamps in seconds with respect to the original spectrogram grid
|
||||
|
||||
@@ -689,7 +689,7 @@ class LTX2Pipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMix
|
||||
latents_per_second = (
|
||||
float(sampling_rate) / float(hop_length) / float(self.audio_vae_temporal_compression_ratio)
|
||||
)
|
||||
latent_length = int(duration_s * latents_per_second)
|
||||
latent_length = round(duration_s * latents_per_second)
|
||||
|
||||
if latents is not None:
|
||||
return latents.to(device=device, dtype=dtype), latent_length
|
||||
|
||||
@@ -749,7 +749,7 @@ class LTX2ImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoL
|
||||
latents_per_second = (
|
||||
float(sampling_rate) / float(hop_length) / float(self.audio_vae_temporal_compression_ratio)
|
||||
)
|
||||
latent_length = int(duration_s * latents_per_second)
|
||||
latent_length = round(duration_s * latents_per_second)
|
||||
|
||||
if latents is not None:
|
||||
return latents.to(device=device, dtype=dtype), latent_length
|
||||
|
||||
Reference in New Issue
Block a user