use export util funcs.

2026-01-27 17:22:53 +03:00 · 2026-01-06 09:14:38 +05:30
parent c039c87b99
commit 550eca3530
2 changed files with 2 additions and 216 deletions
--- a/scripts/ltx2_test_full_pipeline.py
+++ b/scripts/ltx2_test_full_pipeline.py
@@ -1,117 +1,10 @@
 import argparse
 import os
-from fractions import Fraction
-from typing import Optional

-import av  # Needs to be installed separately (`pip install av`)
 import torch

 from diffusers import LTX2Pipeline
-
-
-# Video export functions copied from original LTX 2.0 code
-def _prepare_audio_stream(container: av.container.Container, audio_sample_rate: int) -> av.audio.AudioStream:
-    """
-    Prepare the audio stream for writing.
-    """
-    audio_stream = container.add_stream("aac", rate=audio_sample_rate)
-    audio_stream.codec_context.sample_rate = audio_sample_rate
-    audio_stream.codec_context.layout = "stereo"
-    audio_stream.codec_context.time_base = Fraction(1, audio_sample_rate)
-    return audio_stream
-
-
-def _resample_audio(
-    container: av.container.Container, audio_stream: av.audio.AudioStream, frame_in: av.AudioFrame
-) -> None:
-    cc = audio_stream.codec_context
-
-    # Use the encoder's format/layout/rate as the *target*
-    target_format = cc.format or "fltp"  # AAC → usually fltp
-    target_layout = cc.layout or "stereo"
-    target_rate = cc.sample_rate or frame_in.sample_rate
-
-    audio_resampler = av.audio.resampler.AudioResampler(
-        format=target_format,
-        layout=target_layout,
-        rate=target_rate,
-    )
-
-    audio_next_pts = 0
-    for rframe in audio_resampler.resample(frame_in):
-        if rframe.pts is None:
-            rframe.pts = audio_next_pts
-        audio_next_pts += rframe.samples
-        rframe.sample_rate = frame_in.sample_rate
-        container.mux(audio_stream.encode(rframe))
-
-    # flush audio encoder
-    for packet in audio_stream.encode():
-        container.mux(packet)
-
-
-def _write_audio(
-    container: av.container.Container,
-    audio_stream: av.audio.AudioStream,
-    samples: torch.Tensor,
-    audio_sample_rate: int,
-) -> None:
-    if samples.ndim == 1:
-        samples = samples[:, None]
-
-    if samples.shape[1] != 2 and samples.shape[0] == 2:
-        samples = samples.T
-
-    if samples.shape[1] != 2:
-        raise ValueError(f"Expected samples with 2 channels; got shape {samples.shape}.")
-
-    # Convert to int16 packed for ingestion; resampler converts to encoder fmt.
-    if samples.dtype != torch.int16:
-        samples = torch.clip(samples, -1.0, 1.0)
-        samples = (samples * 32767.0).to(torch.int16)
-
-    frame_in = av.AudioFrame.from_ndarray(
-        samples.contiguous().reshape(1, -1).cpu().numpy(),
-        format="s16",
-        layout="stereo",
-    )
-    frame_in.sample_rate = audio_sample_rate
-
-    _resample_audio(container, audio_stream, frame_in)
-
-
-def encode_video(
-    video: torch.Tensor, fps: int, audio: Optional[torch.Tensor], audio_sample_rate: Optional[int], output_path: str
-) -> None:
-    video_np = video.cpu().numpy()
-
-    _, height, width, _ = video_np.shape
-
-    container = av.open(output_path, mode="w")
-    stream = container.add_stream("libx264", rate=int(fps))
-    stream.width = width
-    stream.height = height
-    stream.pix_fmt = "yuv420p"
-
-    if audio is not None:
-        if audio_sample_rate is None:
-            raise ValueError("audio_sample_rate is required when audio is provided")
-
-        audio_stream = _prepare_audio_stream(container, audio_sample_rate)
-
-    for frame_array in video_np:
-        frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
-        for packet in stream.encode(frame):
-            container.mux(packet)
-
-    # Flush encoder
-    for packet in stream.encode():
-        container.mux(packet)
-
-    if audio is not None:
-        _write_audio(container, audio_stream, audio, audio_sample_rate)
-
-    container.close()
+from diffusers.pipelines.ltx2.export_utils import encode_video


 def parse_args():
--- a/scripts/ltx2_test_full_pipeline_i2v.py
+++ b/scripts/ltx2_test_full_pipeline_i2v.py
@@ -1,118 +1,11 @@
 import argparse
 import os
-from fractions import Fraction
-from typing import Optional

-import av  # Needs to be installed separately (`pip install av`)
 import torch
 from PIL import Image

 from diffusers.pipelines.ltx2 import LTX2ImageToVideoPipeline
-
-
-# Video export functions copied from original LTX 2.0 code
-def _prepare_audio_stream(container: av.container.Container, audio_sample_rate: int) -> av.audio.AudioStream:
-    """
-    Prepare the audio stream for writing.
-    """
-    audio_stream = container.add_stream("aac", rate=audio_sample_rate)
-    audio_stream.codec_context.sample_rate = audio_sample_rate
-    audio_stream.codec_context.layout = "stereo"
-    audio_stream.codec_context.time_base = Fraction(1, audio_sample_rate)
-    return audio_stream
-
-
-def _resample_audio(
-    container: av.container.Container, audio_stream: av.audio.AudioStream, frame_in: av.AudioFrame
-) -> None:
-    cc = audio_stream.codec_context
-
-    # Use the encoder's format/layout/rate as the *target*
-    target_format = cc.format or "fltp"  # AAC → usually fltp
-    target_layout = cc.layout or "stereo"
-    target_rate = cc.sample_rate or frame_in.sample_rate
-
-    audio_resampler = av.audio.resampler.AudioResampler(
-        format=target_format,
-        layout=target_layout,
-        rate=target_rate,
-    )
-
-    audio_next_pts = 0
-    for rframe in audio_resampler.resample(frame_in):
-        if rframe.pts is None:
-            rframe.pts = audio_next_pts
-        audio_next_pts += rframe.samples
-        rframe.sample_rate = frame_in.sample_rate
-        container.mux(audio_stream.encode(rframe))
-
-    # flush audio encoder
-    for packet in audio_stream.encode():
-        container.mux(packet)
-
-
-def _write_audio(
-    container: av.container.Container,
-    audio_stream: av.audio.AudioStream,
-    samples: torch.Tensor,
-    audio_sample_rate: int,
-) -> None:
-    if samples.ndim == 1:
-        samples = samples[:, None]
-
-    if samples.shape[1] != 2 and samples.shape[0] == 2:
-        samples = samples.T
-
-    if samples.shape[1] != 2:
-        raise ValueError(f"Expected samples with 2 channels; got shape {samples.shape}.")
-
-    # Convert to int16 packed for ingestion; resampler converts to encoder fmt.
-    if samples.dtype != torch.int16:
-        samples = torch.clip(samples, -1.0, 1.0)
-        samples = (samples * 32767.0).to(torch.int16)
-
-    frame_in = av.AudioFrame.from_ndarray(
-        samples.contiguous().reshape(1, -1).cpu().numpy(),
-        format="s16",
-        layout="stereo",
-    )
-    frame_in.sample_rate = audio_sample_rate
-
-    _resample_audio(container, audio_stream, frame_in)
-
-
-def encode_video(
-    video: torch.Tensor, fps: int, audio: Optional[torch.Tensor], audio_sample_rate: Optional[int], output_path: str
-) -> None:
-    video_np = video.cpu().numpy()
-
-    _, height, width, _ = video_np.shape
-
-    container = av.open(output_path, mode="w")
-    stream = container.add_stream("libx264", rate=int(fps))
-    stream.width = width
-    stream.height = height
-    stream.pix_fmt = "yuv420p"
-
-    if audio is not None:
-        if audio_sample_rate is None:
-            raise ValueError("audio_sample_rate is required when audio is provided")
-
-        audio_stream = _prepare_audio_stream(container, audio_sample_rate)
-
-    for frame_array in video_np:
-        frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
-        for packet in stream.encode(frame):
-            container.mux(packet)
-
-    # Flush encoder
-    for packet in stream.encode():
-        container.mux(packet)
-
-    if audio is not None:
-        _write_audio(container, audio_stream, audio, audio_sample_rate)
-
-    container.close()
+from diffusers.pipelines.ltx2.export_utils import encode_video


 def parse_args():