# Reconstructed from patch a7d6916afcf3e291cbd65ef9775a108ae750098f
# "Add test script for LTX 2.0 latent upsampling" (Daniel Gu <danielgu@berkeley.edu-style>, Tue, 6 Jan 2026)
# Target path: scripts/ltx2_test_latent_upsampler.py
"""Smoke-test script: run LTX 2.0 image-to-video generation, then spatially
upsample the result with the LTX 2.0 latent upsampler and export an MP4
(with the generated audio track muxed in).
"""

import argparse
import gc
import os

import torch

from diffusers import AutoencoderKLLTX2Video
from diffusers.utils import load_image
from diffusers.pipelines.ltx2 import LTX2ImageToVideoPipeline, LTX2LatentUpsamplePipeline, LTX2LatentUpsamplerModel
from diffusers.pipelines.ltx2.export_utils import encode_video


def parse_args():
    """Parse CLI arguments for the I2V + latent-upsampling test run.

    Returns:
        argparse.Namespace: parsed arguments, with ``args.dtype`` already
        converted from the ``"bf16"``/``"fp32"`` string to a ``torch.dtype``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--model_id", type=str, default="diffusers-internal-dev/new-ltx-model")
    parser.add_argument("--revision", type=str, default="main")

    parser.add_argument("--image_path", required=True, type=str)
    parser.add_argument(
        "--prompt",
        type=str,
        default=(
            "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart "
            "in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in "
            "slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless "
            "motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep "
            "darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and "
            "scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground "
            "dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity "
            "motion, cinematic lighting, and a breath-taking, movie-like shot."
        ),
    )
    parser.add_argument(
        "--negative_prompt",
        type=str,
        default=(
            "shaky, glitchy, low quality, worst quality, deformed, distorted, disfigured, motion smear, motion "
            "artifacts, fused fingers, bad anatomy, weird hand, ugly, transition, static."
        ),
    )

    parser.add_argument("--num_inference_steps", type=int, default=40)
    parser.add_argument("--height", type=int, default=512)
    parser.add_argument("--width", type=int, default=768)
    parser.add_argument("--num_frames", type=int, default=121)
    parser.add_argument("--frame_rate", type=float, default=25.0)
    parser.add_argument("--guidance_scale", type=float, default=3.0)
    parser.add_argument("--seed", type=int, default=42)
    # NOTE(review): parsed but never read in main() — presumably consumed by a
    # future scheduler patch; confirm before removing.
    parser.add_argument("--apply_scheduler_fix", action="store_true")

    parser.add_argument("--device", type=str, default="cuda:0")
    parser.add_argument("--dtype", type=str, default="bf16")
    parser.add_argument("--cpu_offload", action="store_true")

    parser.add_argument(
        "--output_dir",
        type=str,
        default="samples",
        help="Output directory for generated video",
    )
    parser.add_argument(
        "--output_filename",
        type=str,
        default="ltx2_i2v_video_upsampled.mp4",
        help="Filename of the exported generated video",
    )

    args = parser.parse_args()
    # Any value other than "bf16" falls back to float32.
    args.dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float32
    return args


def main(args):
    """Generate a video from an image prompt, upsample it, and export it.

    Loads the full I2V pipeline, generates video + audio, frees the pipeline,
    then loads only the VAE and latent upsampler to upscale the video before
    encoding everything to ``args.output_dir/args.output_filename``.
    """
    pipeline = LTX2ImageToVideoPipeline.from_pretrained(
        args.model_id, revision=args.revision, torch_dtype=args.dtype,
    )
    if args.cpu_offload:
        pipeline.enable_model_cpu_offload()
    else:
        pipeline.to(device=args.device)

    image = load_image(args.image_path)

    video, audio = pipeline(
        image=image,
        prompt=args.prompt,
        negative_prompt=args.negative_prompt,
        height=args.height,
        width=args.width,
        num_frames=args.num_frames,
        frame_rate=args.frame_rate,
        num_inference_steps=args.num_inference_steps,
        guidance_scale=args.guidance_scale,
        generator=torch.Generator(device=args.device).manual_seed(args.seed),
        output_type="pil",
        return_dict=False,
    )

    # Read the sampling rate before the pipeline (and its vocoder) is freed.
    output_sampling_rate = pipeline.vocoder.config.output_sampling_rate

    # Free the full pipeline before loading the upsampler so both never occupy
    # GPU memory at once (avoids OOM). Collect Python garbage first so the
    # CUDA tensors are actually released, then flush the allocator cache.
    pipeline.to(device="cpu")
    del pipeline
    gc.collect()
    torch.cuda.empty_cache()

    vae = AutoencoderKLLTX2Video.from_pretrained(
        args.model_id,
        subfolder="vae",
        revision=args.revision,
        torch_dtype=args.dtype,
    )
    latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
        args.model_id,
        subfolder="latent_upsampler",
        revision=args.revision,
        torch_dtype=args.dtype,
    )
    upsample_pipeline = LTX2LatentUpsamplePipeline(vae=vae, latent_upsampler=latent_upsampler)
    upsample_pipeline.to(device=args.device)

    video = upsample_pipeline(
        video=video,
        height=args.height,
        width=args.width,
        output_type="np",
        return_dict=False,
    )[0]

    # Convert video to uint8 (but keep as NumPy array), then wrap as a tensor
    # for the encoder.
    video = (video * 255).round().astype("uint8")
    video = torch.from_numpy(video)

    # Ensure the output directory exists; encode_video would otherwise fail on
    # a fresh checkout where "samples/" has not been created yet.
    os.makedirs(args.output_dir, exist_ok=True)

    encode_video(
        video[0],
        fps=args.frame_rate,
        audio=audio[0].float().cpu(),
        audio_sample_rate=output_sampling_rate,  # should be 24000
        output_path=os.path.join(args.output_dir, args.output_filename),
    )


if __name__ == "__main__":
    args = parse_args()
    main(args)