From 595f485ad8e1449eeae29639bd5e09b3887eb4f0 Mon Sep 17 00:00:00 2001
From: Daniel Gu
Date: Tue, 23 Dec 2025 07:41:28 +0100
Subject: [PATCH] LTX 2.0 scheduler and full pipeline conversion

---
 scripts/convert_ltx2_to_diffusers.py          | 33 ++++++++++++++++---
 src/diffusers/pipelines/ltx2/pipeline_ltx2.py |  6 ++--
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/scripts/convert_ltx2_to_diffusers.py b/scripts/convert_ltx2_to_diffusers.py
index 78494a52b9..6c4cac7396 100644
--- a/scripts/convert_ltx2_to_diffusers.py
+++ b/scripts/convert_ltx2_to_diffusers.py
@@ -7,9 +7,15 @@ import safetensors.torch
 import torch
 from accelerate import init_empty_weights
 from huggingface_hub import hf_hub_download
-from transformers import AutoModel, AutoProcessor
+from transformers import AutoModel, AutoTokenizer
 
-from diffusers import AutoencoderKLLTX2Audio, AutoencoderKLLTX2Video, LTX2VideoTransformer3DModel
+from diffusers import (
+    AutoencoderKLLTX2Audio,
+    AutoencoderKLLTX2Video,
+    FlowMatchEulerDiscreteScheduler,
+    LTX2Pipeline,
+    LTX2VideoTransformer3DModel,
+)
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.pipelines.ltx2.text_encoder import LTX2AudioVisualTextEncoder
 from diffusers.pipelines.ltx2.vocoder import LTX2Vocoder
@@ -721,12 +727,31 @@ def main(args):
     if not args.full_pipeline:
         text_encoder.to(text_encoder_dtype).save_pretrained(os.path.join(args.output_path, "text_encoder"))
 
-    tokenizer = AutoProcessor.from_pretrained(args.tokenizer_id)
+    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_id)
     if not args.full_pipeline:
         tokenizer.save_pretrained(os.path.join(args.output_path, "tokenizer"))
 
     if args.full_pipeline:
-        pass
+        scheduler = FlowMatchEulerDiscreteScheduler(
+            use_dynamic_shifting=True,
+            base_shift=0.95,
+            max_shift=2.05,
+            base_image_seq_len=1024,
+            max_image_seq_len=4096,
+            shift_terminal=0.1,
+        )
+
+        pipe = LTX2Pipeline(
+            scheduler=scheduler,
+            vae=vae,
+            audio_vae=audio_vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            vocoder=vocoder,
+        )
+
+        pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
 
 
 if __name__ == '__main__':

diff --git a/src/diffusers/pipelines/ltx2/pipeline_ltx2.py b/src/diffusers/pipelines/ltx2/pipeline_ltx2.py
index af9b0096fd..eff87c08a3 100644
--- a/src/diffusers/pipelines/ltx2/pipeline_ltx2.py
+++ b/src/diffusers/pipelines/ltx2/pipeline_ltx2.py
@@ -883,10 +883,10 @@ class LTX2Pipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMix
         sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
         mu = calculate_shift(
             video_sequence_length,
-            self.scheduler.config.get("base_image_seq_len", 256),
+            self.scheduler.config.get("base_image_seq_len", 1024),
             self.scheduler.config.get("max_image_seq_len", 4096),
-            self.scheduler.config.get("base_shift", 0.5),
-            self.scheduler.config.get("max_shift", 1.15),
+            self.scheduler.config.get("base_shift", 0.95),
+            self.scheduler.config.get("max_shift", 2.05),
         )
         timesteps, num_inference_steps = retrieve_timesteps(
             self.scheduler,