From 550eca353041ef3b54170f16f35e0f1e68588a42 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Tue, 6 Jan 2026 09:14:38 +0530 Subject: [PATCH] use export util funcs. --- scripts/ltx2_test_full_pipeline.py | 109 +------------------------ scripts/ltx2_test_full_pipeline_i2v.py | 109 +------------------------ 2 files changed, 2 insertions(+), 216 deletions(-) diff --git a/scripts/ltx2_test_full_pipeline.py b/scripts/ltx2_test_full_pipeline.py index 16ea9f8040..20eb65fde2 100644 --- a/scripts/ltx2_test_full_pipeline.py +++ b/scripts/ltx2_test_full_pipeline.py @@ -1,117 +1,10 @@ import argparse import os -from fractions import Fraction -from typing import Optional -import av # Needs to be installed separately (`pip install av`) import torch from diffusers import LTX2Pipeline - - -# Video export functions copied from original LTX 2.0 code -def _prepare_audio_stream(container: av.container.Container, audio_sample_rate: int) -> av.audio.AudioStream: - """ - Prepare the audio stream for writing. - """ - audio_stream = container.add_stream("aac", rate=audio_sample_rate) - audio_stream.codec_context.sample_rate = audio_sample_rate - audio_stream.codec_context.layout = "stereo" - audio_stream.codec_context.time_base = Fraction(1, audio_sample_rate) - return audio_stream - - -def _resample_audio( - container: av.container.Container, audio_stream: av.audio.AudioStream, frame_in: av.AudioFrame -) -> None: - cc = audio_stream.codec_context - - # Use the encoder's format/layout/rate as the *target* - target_format = cc.format or "fltp" # AAC → usually fltp - target_layout = cc.layout or "stereo" - target_rate = cc.sample_rate or frame_in.sample_rate - - audio_resampler = av.audio.resampler.AudioResampler( - format=target_format, - layout=target_layout, - rate=target_rate, - ) - - audio_next_pts = 0 - for rframe in audio_resampler.resample(frame_in): - if rframe.pts is None: - rframe.pts = audio_next_pts - audio_next_pts += rframe.samples - rframe.sample_rate = frame_in.sample_rate - container.mux(audio_stream.encode(rframe)) - - # flush audio encoder - for packet in audio_stream.encode(): - container.mux(packet) - - -def _write_audio( - container: av.container.Container, - audio_stream: av.audio.AudioStream, - samples: torch.Tensor, - audio_sample_rate: int, -) -> None: - if samples.ndim == 1: - samples = samples[:, None] - - if samples.shape[1] != 2 and samples.shape[0] == 2: - samples = samples.T - - if samples.shape[1] != 2: - raise ValueError(f"Expected samples with 2 channels; got shape {samples.shape}.") - - # Convert to int16 packed for ingestion; resampler converts to encoder fmt. - if samples.dtype != torch.int16: - samples = torch.clip(samples, -1.0, 1.0) - samples = (samples * 32767.0).to(torch.int16) - - frame_in = av.AudioFrame.from_ndarray( - samples.contiguous().reshape(1, -1).cpu().numpy(), - format="s16", - layout="stereo", - ) - frame_in.sample_rate = audio_sample_rate - - _resample_audio(container, audio_stream, frame_in) - - -def encode_video( - video: torch.Tensor, fps: int, audio: Optional[torch.Tensor], audio_sample_rate: Optional[int], output_path: str -) -> None: - video_np = video.cpu().numpy() - - _, height, width, _ = video_np.shape - - container = av.open(output_path, mode="w") - stream = container.add_stream("libx264", rate=int(fps)) - stream.width = width - stream.height = height - stream.pix_fmt = "yuv420p" - - if audio is not None: - if audio_sample_rate is None: - raise ValueError("audio_sample_rate is required when audio is provided") - - audio_stream = _prepare_audio_stream(container, audio_sample_rate) - - for frame_array in video_np: - frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24") - for packet in stream.encode(frame): - container.mux(packet) - - # Flush encoder - for packet in stream.encode(): - container.mux(packet) - - if audio is not None: - _write_audio(container, audio_stream, audio, audio_sample_rate) - - container.close() +from diffusers.pipelines.ltx2.export_utils import encode_video def parse_args(): diff --git a/scripts/ltx2_test_full_pipeline_i2v.py b/scripts/ltx2_test_full_pipeline_i2v.py index 8c39647eae..f99d8c135e 100644 --- a/scripts/ltx2_test_full_pipeline_i2v.py +++ b/scripts/ltx2_test_full_pipeline_i2v.py @@ -1,118 +1,11 @@ import argparse import os -from fractions import Fraction -from typing import Optional -import av # Needs to be installed separately (`pip install av`) import torch from PIL import Image from diffusers.pipelines.ltx2 import LTX2ImageToVideoPipeline - - -# Video export functions copied from original LTX 2.0 code -def _prepare_audio_stream(container: av.container.Container, audio_sample_rate: int) -> av.audio.AudioStream: - """ - Prepare the audio stream for writing. - """ - audio_stream = container.add_stream("aac", rate=audio_sample_rate) - audio_stream.codec_context.sample_rate = audio_sample_rate - audio_stream.codec_context.layout = "stereo" - audio_stream.codec_context.time_base = Fraction(1, audio_sample_rate) - return audio_stream - - -def _resample_audio( - container: av.container.Container, audio_stream: av.audio.AudioStream, frame_in: av.AudioFrame -) -> None: - cc = audio_stream.codec_context - - # Use the encoder's format/layout/rate as the *target* - target_format = cc.format or "fltp" # AAC → usually fltp - target_layout = cc.layout or "stereo" - target_rate = cc.sample_rate or frame_in.sample_rate - - audio_resampler = av.audio.resampler.AudioResampler( - format=target_format, - layout=target_layout, - rate=target_rate, - ) - - audio_next_pts = 0 - for rframe in audio_resampler.resample(frame_in): - if rframe.pts is None: - rframe.pts = audio_next_pts - audio_next_pts += rframe.samples - rframe.sample_rate = frame_in.sample_rate - container.mux(audio_stream.encode(rframe)) - - # flush audio encoder - for packet in audio_stream.encode(): - container.mux(packet) - - -def _write_audio( - container: av.container.Container, - audio_stream: av.audio.AudioStream, - samples: torch.Tensor, - audio_sample_rate: int, -) -> None: - if samples.ndim == 1: - samples = samples[:, None] - - if samples.shape[1] != 2 and samples.shape[0] == 2: - samples = samples.T - - if samples.shape[1] != 2: - raise ValueError(f"Expected samples with 2 channels; got shape {samples.shape}.") - - # Convert to int16 packed for ingestion; resampler converts to encoder fmt. - if samples.dtype != torch.int16: - samples = torch.clip(samples, -1.0, 1.0) - samples = (samples * 32767.0).to(torch.int16) - - frame_in = av.AudioFrame.from_ndarray( - samples.contiguous().reshape(1, -1).cpu().numpy(), - format="s16", - layout="stereo", - ) - frame_in.sample_rate = audio_sample_rate - - _resample_audio(container, audio_stream, frame_in) - - -def encode_video( - video: torch.Tensor, fps: int, audio: Optional[torch.Tensor], audio_sample_rate: Optional[int], output_path: str -) -> None: - video_np = video.cpu().numpy() - - _, height, width, _ = video_np.shape - - container = av.open(output_path, mode="w") - stream = container.add_stream("libx264", rate=int(fps)) - stream.width = width - stream.height = height - stream.pix_fmt = "yuv420p" - - if audio is not None: - if audio_sample_rate is None: - raise ValueError("audio_sample_rate is required when audio is provided") - - audio_stream = _prepare_audio_stream(container, audio_sample_rate) - - for frame_array in video_np: - frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24") - for packet in stream.encode(frame): - container.mux(packet) - - # Flush encoder - for packet in stream.encode(): - container.mux(packet) - - if audio is not None: - _write_audio(container, audio_stream, audio, audio_sample_rate) - - container.close() +from diffusers.pipelines.ltx2.export_utils import encode_video def parse_args():