diff --git a/src/diffusers/pipelines/ltx2/pipeline_ltx2.py b/src/diffusers/pipelines/ltx2/pipeline_ltx2.py
index 7cbcca67d2..103e324e11 100644
--- a/src/diffusers/pipelines/ltx2/pipeline_ltx2.py
+++ b/src/diffusers/pipelines/ltx2/pipeline_ltx2.py
@@ -47,24 +47,36 @@ EXAMPLE_DOC_STRING = """
     Examples:
         ```py
         >>> import torch
-        >>> from diffusers import LTXPipeline
-        >>> from diffusers.utils import export_to_video
+        >>> from diffusers import LTX2Pipeline
+        >>> from diffusers.pipelines.ltx2.export_utils import encode_video
 
-        >>> pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
-        >>> pipe.to("cuda")
+        >>> pipe = LTX2Pipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
+        >>> pipe.enable_model_cpu_offload()
 
         >>> prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
         >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
 
-        >>> video = pipe(
+        >>> frame_rate = 24.0
+        >>> video, audio = pipe(
         ...     prompt=prompt,
         ...     negative_prompt=negative_prompt,
-        ...     width=704,
-        ...     height=480,
-        ...     num_frames=161,
-        ...     num_inference_steps=50,
-        ... ).frames[0]
-        >>> export_to_video(video, "output.mp4", fps=24)
+        ...     width=768,
+        ...     height=512,
+        ...     frame_rate=frame_rate,
+        ...     num_frames=121,
+        ...     output_type="np",
+        ...     return_dict=False,
+        ... )
+        >>> video = (video * 255).round().astype("uint8")
+        >>> video = torch.from_numpy(video)
+
+        >>> encode_video(
+        ...     video[0],
+        ...     fps=frame_rate,
+        ...     audio=audio[0].float().cpu(),
+        ...     audio_sample_rate=pipe.vocoder.config.output_sampling_rate,  # should be 24000
+        ...     output_path="video.mp4",
+        ... )
         ```
 """
diff --git a/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py b/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py
index 0a707806ce..df0faa2b07 100644
--- a/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py
+++ b/src/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.py
@@ -48,11 +48,12 @@ EXAMPLE_DOC_STRING = """
     Examples:
         ```py
         >>> import torch
         >>> from diffusers import LTX2ImageToVideoPipeline
-        >>> from diffusers.utils import export_to_video, load_image
+        >>> from diffusers.pipelines.ltx2.export_utils import encode_video
+        >>> from diffusers.utils import load_image
 
-        >>> pipe = LTX2ImageToVideoPipeline.from_pretrained("Lightricks/LTX-Video-2", torch_dtype=torch.bfloat16)
-        >>> pipe.to("cuda")
+        >>> pipe = LTX2ImageToVideoPipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
+        >>> pipe.enable_model_cpu_offload()
 
         >>> image = load_image(
         ...     "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
@@ -60,16 +61,28 @@ EXAMPLE_DOC_STRING = """
         >>> prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background."
         >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
 
-        >>> video = pipe(
+        >>> frame_rate = 24.0
+        >>> video, audio = pipe(
         ...     image=image,
         ...     prompt=prompt,
         ...     negative_prompt=negative_prompt,
-        ...     width=704,
-        ...     height=480,
+        ...     width=768,
+        ...     height=512,
         ...     num_frames=121,
-        ...     num_inference_steps=40,
-        ... ).frames[0]
-        >>> export_to_video(video, "output.mp4", fps=24)
+        ...     frame_rate=frame_rate,
+        ...     output_type="np",
+        ...     return_dict=False,
+        ... )
+        >>> video = (video * 255).round().astype("uint8")
+        >>> video = torch.from_numpy(video)
+
+        >>> encode_video(
+        ...     video[0],
+        ...     fps=frame_rate,
+        ...     audio=audio[0].float().cpu(),
+        ...     audio_sample_rate=pipe.vocoder.config.output_sampling_rate,  # should be 24000
+        ...     output_path="video.mp4",
+        ... )
         ```
 """