diff --git a/scripts/ltx2_test_full_pipeline.py b/scripts/ltx2_test_full_pipeline.py
index 019bbda46d..1907be2da8 100644
--- a/scripts/ltx2_test_full_pipeline.py
+++ b/scripts/ltx2_test_full_pipeline.py
@@ -191,6 +191,7 @@ def main(args):
         guidance_scale=args.guidance_scale,
         generator=torch.Generator(device=args.device).manual_seed(args.seed),
         output_type="np",
+        return_dict=False,
     )
 
     # Convert video to uint8 (but keep as NumPy array)
diff --git a/src/diffusers/pipelines/ltx2/pipeline_ltx2.py b/src/diffusers/pipelines/ltx2/pipeline_ltx2.py
index eff87c08a3..e8a41050f5 100644
--- a/src/diffusers/pipelines/ltx2/pipeline_ltx2.py
+++ b/src/diffusers/pipelines/ltx2/pipeline_ltx2.py
@@ -1033,7 +1033,7 @@ class LTX2Pipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMix
         # NOTE: currently, unlike the video VAE, we denormalize the audio latents inside the audio VAE decoder's
         # decode method
         generated_mel_spectrograms = self.audio_vae.decode(audio_latents, return_dict=False)[0]
-        waveforms = self.vocoder(generated_mel_spectrograms)
+        audio = self.vocoder(generated_mel_spectrograms)
 
         # Offload all models
         self.maybe_free_model_hooks()
@@ -1041,4 +1041,4 @@ class LTX2Pipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMix
         if not return_dict:
             return (video, audio)
 
-        return LTX2PipelineOutput(frames=video, audio=waveforms)
+        return LTX2PipelineOutput(frames=video, audio=audio)
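
Taken together, the two hunks fix a `NameError` on the tuple return path: the vocoder output was bound to `waveforms`, but `if not return_dict: return (video, audio)` referenced the undefined name `audio`. The script change opts into that path via `return_dict=False` so the test exercises the fixed branch. Below is a minimal sketch of the resulting call pattern; it assumes `LTX2Pipeline` is importable from the top-level `diffusers` namespace, and the checkpoint id, device, and prompt are hypothetical, not taken from this diff:

```python
import torch
from diffusers import LTX2Pipeline  # assumed top-level export; the class lives in pipelines/ltx2/pipeline_ltx2.py

# Hypothetical checkpoint id and device -- illustrative only.
pipe = LTX2Pipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
pipe.to("cuda")

# With return_dict=False the pipeline returns a plain (video, audio) tuple,
# the branch that previously raised NameError on the undefined name `audio`.
video, audio = pipe(
    prompt="a dog barking at the beach",  # illustrative prompt
    guidance_scale=4.0,
    generator=torch.Generator(device="cuda").manual_seed(0),
    output_type="np",
    return_dict=False,
)
```

With `return_dict=True` (the default), the same values now come back consistently as `LTX2PipelineOutput(frames=video, audio=audio)`.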