mirror of
https://github.com/huggingface/diffusers.git
synced 2026-01-27 17:22:53 +03:00
Kandinsky 5.0 Video Pro and Image Lite (#12664)
* add transformer pipeline first version

---------

Co-authored-by: Álvaro Somoza <asomoza@users.noreply.github.com>
Co-authored-by: YiYi Xu <yixu310@gmail.com>
Co-authored-by: Charles <charles@huggingface.co>
Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: dmitrienkoae <dmitrienko.ae@phystech.edu>
Co-authored-by: nvvaulin <nvvaulin@gmail.com>
This commit is contained in:
@@ -664,6 +664,8 @@
    title: HunyuanVideo1.5
  - local: api/pipelines/i2vgenxl
    title: I2VGen-XL
  - local: api/pipelines/kandinsky5_image
    title: Kandinsky 5.0 Image
  - local: api/pipelines/kandinsky5_video
    title: Kandinsky 5.0 Video
  - local: api/pipelines/latte

112  docs/source/en/api/pipelines/kandinsky5_image.md  Normal file
@@ -0,0 +1,112 @@
<!--Copyright 2025 The HuggingFace Team and Kandinsky Lab Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Kandinsky 5.0 Image

[Kandinsky 5.0](https://arxiv.org/abs/2511.14993) is a family of diffusion models for Video & Image generation.

Kandinsky 5.0 Image Lite is a lightweight image generation model (6B parameters).

The model introduces several key innovations:
- **Latent diffusion pipeline** with **Flow Matching** for improved training stability
- **Diffusion Transformer (DiT)** as the main generative backbone with cross-attention to text embeddings
- Dual text encoding using **Qwen2.5-VL** and **CLIP** for comprehensive text understanding
- **Flux VAE** for efficient image encoding and decoding

The original codebase can be found at [kandinskylab/Kandinsky-5](https://github.com/kandinskylab/Kandinsky-5).
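The components above map directly onto the loaded pipeline. The snippet below is a minimal sketch for inspecting them after `from_pretrained`; the attribute names (`transformer`, `vae`, `text_encoder`, `text_encoder_2`, `scheduler`) are assumed to follow the same component layout used by the other Kandinsky 5.0 pipelines.

```python
import torch
from diffusers import Kandinsky5T2IPipeline

# Load the T2I Lite checkpoint and print the class of each registered component.
pipe = Kandinsky5T2IPipeline.from_pretrained(
    "kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers", torch_dtype=torch.bfloat16
)

print(type(pipe.transformer).__name__)     # Diffusion Transformer (DiT) backbone
print(type(pipe.text_encoder).__name__)    # Qwen2.5-VL text encoder
print(type(pipe.text_encoder_2).__name__)  # CLIP text encoder
print(type(pipe.vae).__name__)             # Flux VAE
print(type(pipe.scheduler).__name__)       # flow-matching scheduler
```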
## Available Models

Kandinsky 5.0 Image Lite:

| model_id | Description | Use Cases |
|------------|-------------|-----------|
| [**kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers**](https://huggingface.co/kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers) | 6B text-to-image Supervised Fine-Tuned model | Highest generation quality |
| [**kandinskylab/Kandinsky-5.0-I2I-Lite-sft-Diffusers**](https://huggingface.co/kandinskylab/Kandinsky-5.0-I2I-Lite-sft-Diffusers) | 6B image editing Supervised Fine-Tuned model | Highest generation quality |
| [**kandinskylab/Kandinsky-5.0-T2I-Lite-pretrain-Diffusers**](https://huggingface.co/kandinskylab/Kandinsky-5.0-T2I-Lite-pretrain-Diffusers) | 6B text-to-image Base pretrained model | Research and fine-tuning |
| [**kandinskylab/Kandinsky-5.0-I2I-Lite-pretrain-Diffusers**](https://huggingface.co/kandinskylab/Kandinsky-5.0-I2I-Lite-pretrain-Diffusers) | 6B image editing Base pretrained model | Research and fine-tuning |

## Usage Examples

### Basic Text-to-Image Generation

```python
import torch
from diffusers import Kandinsky5T2IPipeline

# Load the pipeline
model_id = "kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers"
pipe = Kandinsky5T2IPipeline.from_pretrained(model_id)
pipe = pipe.to(device="cuda", dtype=torch.bfloat16)

# Generate an image
prompt = "A fluffy, expressive cat wearing a bright red hat with a soft, slightly textured fabric. The hat should look cozy and well-fitted on the cat’s head. On the front of the hat, add clean, bold white text that reads “SWEET”, clearly visible and neatly centered. Ensure the overall lighting highlights the hat’s color and the cat’s fur details."

output = pipe(
    prompt=prompt,
    negative_prompt="",
    height=1024,
    width=1024,
    num_inference_steps=50,
    guidance_scale=3.5,
).image[0]
```
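If the 6B checkpoint does not fit into GPU memory, the standard Diffusers offloading helpers should work here as well. A minimal sketch, assuming `Kandinsky5T2IPipeline` supports the usual `DiffusionPipeline` offloading API (as the image-to-image example below does):

```python
import torch
from diffusers import Kandinsky5T2IPipeline

pipe = Kandinsky5T2IPipeline.from_pretrained(
    "kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers", torch_dtype=torch.bfloat16
)
# Keep submodules on the CPU and move them to the GPU only while they run,
# instead of placing the whole pipeline on the GPU at once.
pipe.enable_model_cpu_offload()

image = pipe(
    prompt="A watercolor painting of a lighthouse at sunrise",
    negative_prompt="",
    height=1024,
    width=1024,
    num_inference_steps=50,
    guidance_scale=3.5,
).image[0]
```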
### Basic Image-to-Image Generation

```python
import torch
from diffusers import Kandinsky5I2IPipeline
from diffusers.utils import load_image

# Load the pipeline
model_id = "kandinskylab/Kandinsky-5.0-I2I-Lite-sft-Diffusers"
pipe = Kandinsky5I2IPipeline.from_pretrained(model_id)

pipe = pipe.to(device="cuda", dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()  # <--- Enable CPU offloading for single GPU inference

# Edit the input image
image = load_image(
    "https://huggingface.co/kandinsky-community/kandinsky-3/resolve/main/assets/title.jpg?download=true"
)

prompt = "Change the background from a winter night scene to a bright summer day. Place the character on a sandy beach with clear blue sky, soft sunlight, and gentle waves in the distance. Replace the winter clothing with a light short-sleeved T-shirt (in soft pastel colors) and casual shorts. Ensure the character’s fur reflects warm daylight instead of cold winter tones. Add small beach details such as seashells, footprints in the sand, and a few scattered beach toys nearby. Keep the oranges in the scene, but place them naturally on the sand."
negative_prompt = ""

output = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    guidance_scale=3.5,
).image[0]
```
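The image-to-image pipeline validates the requested `(width, height)` pair against a fixed list of supported resolutions and logs a warning (and resizes) if the pair is not in that list. A small sketch, continuing from the example above and assuming the list stays exposed as the public `resolutions` attribute the pipeline is initialized with:

```python
# Supported (width, height) pairs registered on the pipeline.
print(pipe.resolutions)
# [(1024, 1024), (640, 1408), (1408, 640), (768, 1280), (1280, 768), (896, 1152), (1152, 896)]

# Request an explicit resolution from that list to avoid the resize warning.
output = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=1152,
    height=896,
    guidance_scale=3.5,
).image[0]
```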
## Kandinsky5T2IPipeline

[[autodoc]] Kandinsky5T2IPipeline
  - all
  - __call__

## Kandinsky5I2IPipeline

[[autodoc]] Kandinsky5I2IPipeline
  - all
  - __call__

## Citation

```bibtex
@misc{kandinsky2025,
    author = {Alexander Belykh and Alexander Varlamov and Alexey Letunovskiy and Anastasia Aliaskina and Anastasia Maltseva and Anastasiia Kargapoltseva and Andrey Shutkin and Anna Averchenkova and Anna Dmitrienko and Bulat Akhmatov and Denis Dimitrov and Denis Koposov and Denis Parkhomenko and Dmitrii Mikhailov and Ilya Vasiliev and Ivan Kirillov and Julia Agafonova and Kirill Chernyshev and Kormilitsyn Semen and Lev Novitskiy and Maria Kovaleva and Mikhail Mamaev and Nikita Kiselev and Nikita Osterov and Nikolai Gerasimenko and Nikolai Vaulin and Olga Kim and Olga Vdovchenko and Polina Gavrilova and Polina Mikhailova and Tatiana Nikulina and Viacheslav Vasilev and Vladimir Arkhipkin and Vladimir Korviakov and Vladimir Polovnikov and Yury Kolabushin},
    title = {Kandinsky 5.0: A family of diffusion models for Video & Image generation},
    howpublished = {\url{https://github.com/kandinskylab/Kandinsky-5}},
    year = 2025
}
```
@@ -1,4 +1,4 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
<!--Copyright 2025 The HuggingFace Team and Kandinsky Lab Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0
@@ -9,10 +9,11 @@ specific language governing permissions and limitations under the License.

# Kandinsky 5.0 Video

Kandinsky 5.0 Video is created by the Kandinsky team: Alexey Letunovskiy, Maria Kovaleva, Ivan Kirillov, Lev Novitskiy, Denis Koposov, Dmitrii Mikhailov, Anna Averchenkova, Andrey Shutkin, Julia Agafonova, Olga Kim, Anastasiia Kargapoltseva, Nikita Kiselev, Anna Dmitrienko, Anastasia Maltseva, Kirill Chernyshev, Ilia Vasiliev, Viacheslav Vasilev, Vladimir Polovnikov, Yury Kolabushin, Alexander Belykh, Mikhail Mamaev, Anastasia Aliaskina, Tatiana Nikulina, Polina Gavrilova, Vladimir Arkhipkin, Vladimir Korviakov, Nikolai Gerasimenko, Denis Parkhomenko, Denis Dimitrov
[Kandinsky 5.0](https://arxiv.org/abs/2511.14993) is a family of diffusion models for Video & Image generation.

Kandinsky 5.0 Lite is a line-up of lightweight video generation models (2B parameters) that ranks #1 among open-source models in its class. It outperforms larger models and offers the best understanding of Russian concepts in the open-source ecosystem.

Kandinsky 5.0 is a family of diffusion models for Video & Image generation. Kandinsky 5.0 T2V Lite is a lightweight video generation model (2B parameters) that ranks #1 among open-source models in its class. It outperforms larger models and offers the best understanding of Russian concepts in the open-source ecosystem.
Kandinsky 5.0 Pro is a line-up of large, high-quality video generation models (19B parameters). It offers high-quality HD generation and additional generation formats such as I2V.

The model introduces several key innovations:
- **Latent diffusion pipeline** with **Flow Matching** for improved training stability
@@ -21,45 +22,77 @@ The model introduces several key innovations:
- **HunyuanVideo 3D VAE** for efficient video encoding and decoding
- **Sparse attention mechanisms** (NABLA) for efficient long-sequence processing

The original codebase can be found at [ai-forever/Kandinsky-5](https://github.com/ai-forever/Kandinsky-5).
The original codebase can be found at [kandinskylab/Kandinsky-5](https://github.com/kandinskylab/Kandinsky-5).

> [!TIP]
> Check out the [AI Forever](https://huggingface.co/ai-forever) organization on the Hub for the official model checkpoints for text-to-video generation, including pretrained, SFT, no-CFG, and distilled variants.
> Check out the [Kandinsky Lab](https://huggingface.co/kandinskylab) organization on the Hub for the official model checkpoints for text-to-video generation, including pretrained, SFT, no-CFG, and distilled variants.

## Available Models

Kandinsky 5.0 T2V Lite comes in several variants optimized for different use cases:

Kandinsky 5.0 T2V Pro:
| model_id | Description | Use Cases |
|------------|-------------|-----------|
| **ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers** | 5 second Supervised Fine-Tuned model | Highest generation quality |
| **ai-forever/Kandinsky-5.0-T2V-Lite-sft-10s-Diffusers** | 10 second Supervised Fine-Tuned model | Highest generation quality |
| **ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers** | 5 second Classifier-Free Guidance distilled | 2× faster inference |
| **ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-10s-Diffusers** | 10 second Classifier-Free Guidance distilled | 2× faster inference |
| **ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers** | 5 second Diffusion distilled to 16 steps | 6× faster inference, minimal quality loss |
| **ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-10s-Diffusers** | 10 second Diffusion distilled to 16 steps | 6× faster inference, minimal quality loss |
| **ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-5s-Diffusers** | 5 second Base pretrained model | Research and fine-tuning |
| **ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-10s-Diffusers** | 10 second Base pretrained model | Research and fine-tuning |
| **kandinskylab/Kandinsky-5.0-T2V-Pro-sft-5s-Diffusers** | 5 second Text-to-Video Pro model | High-quality text-to-video generation |
| **kandinskylab/Kandinsky-5.0-I2V-Pro-sft-5s-Diffusers** | 5 second Image-to-Video Pro model | High-quality image-to-video generation |

All models are available in 5-second and 10-second video generation versions.
Kandinsky 5.0 T2V Lite:
| model_id | Description | Use Cases |
|------------|-------------|-----------|
| **kandinskylab/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers** | 5 second Supervised Fine-Tuned model | Highest generation quality |
| **kandinskylab/Kandinsky-5.0-T2V-Lite-sft-10s-Diffusers** | 10 second Supervised Fine-Tuned model | Highest generation quality |
| **kandinskylab/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers** | 5 second Classifier-Free Guidance distilled | 2× faster inference |
| **kandinskylab/Kandinsky-5.0-T2V-Lite-nocfg-10s-Diffusers** | 10 second Classifier-Free Guidance distilled | 2× faster inference |
| **kandinskylab/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers** | 5 second Diffusion distilled to 16 steps | 6× faster inference, minimal quality loss |
| **kandinskylab/Kandinsky-5.0-T2V-Lite-distilled16steps-10s-Diffusers** | 10 second Diffusion distilled to 16 steps | 6× faster inference, minimal quality loss |
| **kandinskylab/Kandinsky-5.0-T2V-Lite-pretrain-5s-Diffusers** | 5 second Base pretrained model | Research and fine-tuning |
| **kandinskylab/Kandinsky-5.0-T2V-Lite-pretrain-10s-Diffusers** | 10 second Base pretrained model | Research and fine-tuning |

## Kandinsky5T2VPipeline

[[autodoc]] Kandinsky5T2VPipeline
  - all
  - __call__

## Usage Examples

### Basic Text-to-Video Generation

#### Pro

**⚠️ Warning!** All Pro models should be run with `pipe.enable_model_cpu_offload()` enabled.

```python
import torch
from diffusers import Kandinsky5T2VPipeline
from diffusers.utils import export_to_video

# Load the pipeline
model_id = "ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers"
model_id = "kandinskylab/Kandinsky-5.0-T2V-Pro-sft-5s-Diffusers"
pipe = Kandinsky5T2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)

pipe = pipe.to("cuda")
pipe.transformer.set_attention_backend("flex")  # <--- Set attention backend to Flex
pipe.enable_model_cpu_offload()  # <--- Enable CPU offloading for single GPU inference
pipe.transformer.compile(mode="max-autotune-no-cudagraphs", dynamic=True)  # <--- Compile with max-autotune-no-cudagraphs

# Generate video
prompt = "A cat and a dog baking a cake together in a kitchen."
negative_prompt = "Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"

output = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=768,
    width=1024,
    num_frames=121,  # ~5 seconds at 24fps
    num_inference_steps=50,
    guidance_scale=5.0,
).frames[0]

export_to_video(output, "output.mp4", fps=24, quality=9)
```

#### Lite

```python
import torch
from diffusers import Kandinsky5T2VPipeline
from diffusers.utils import export_to_video

# Load the pipeline
model_id = "kandinskylab/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers"
pipe = Kandinsky5T2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
pipe = pipe.to("cuda")

@@ -85,14 +118,14 @@ export_to_video(output, "output.mp4", fps=24, quality=9)

```python
pipe = Kandinsky5T2VPipeline.from_pretrained(
    "ai-forever/Kandinsky-5.0-T2V-Lite-sft-10s-Diffusers",
    "kandinskylab/Kandinsky-5.0-T2V-Lite-sft-10s-Diffusers",
    torch_dtype=torch.bfloat16
)
pipe = pipe.to("cuda")

pipe.transformer.set_attention_backend(
    "flex"
) # <--- Sett attention bakend to Flex
) # <--- Set attention backend to Flex
pipe.transformer.compile(
    mode="max-autotune-no-cudagraphs",
    dynamic=True
@@ -118,7 +151,7 @@ export_to_video(output, "output.mp4", fps=24, quality=9)

**⚠️ Warning!** All no-CFG and diffusion-distilled models should be run without CFG (`guidance_scale=1.0`):

```python
model_id = "ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers"
model_id = "kandinskylab/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers"
pipe = Kandinsky5T2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
pipe = pipe.to("cuda")

@@ -132,18 +165,145 @@ export_to_video(output, "output.mp4", fps=24, quality=9)
```
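For reference, a complete call for the 16-step distilled checkpoint could look like the sketch below: `num_inference_steps=16` matches the distillation target and `guidance_scale=1.0` disables CFG as required, while the resolution and frame count simply mirror the Pro example above (treat those values as assumptions, not requirements).

```python
import torch
from diffusers import Kandinsky5T2VPipeline
from diffusers.utils import export_to_video

pipe = Kandinsky5T2VPipeline.from_pretrained(
    "kandinskylab/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers",
    torch_dtype=torch.bfloat16,
)
pipe = pipe.to("cuda")

output = pipe(
    prompt="A cat and a dog baking a cake together in a kitchen.",
    height=768,
    width=1024,
    num_frames=121,          # ~5 seconds at 24fps
    num_inference_steps=16,  # the checkpoint is distilled to 16 steps
    guidance_scale=1.0,      # no classifier-free guidance
).frames[0]

export_to_video(output, "distilled_output.mp4", fps=24)
```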
### Basic Image-to-Video Generation

**⚠️ Warning!** All Pro models should be run with `pipe.enable_model_cpu_offload()` enabled.

```python
import torch
from diffusers import Kandinsky5I2VPipeline
from diffusers.utils import export_to_video, load_image

# Load the pipeline
model_id = "kandinskylab/Kandinsky-5.0-I2V-Pro-sft-5s-Diffusers"
pipe = Kandinsky5I2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)

pipe = pipe.to("cuda")
pipe.transformer.set_attention_backend("flex")  # <--- Set attention backend to Flex
pipe.enable_model_cpu_offload()  # <--- Enable CPU offloading for single GPU inference
pipe.transformer.compile(mode="max-autotune-no-cudagraphs", dynamic=True)  # <--- Compile with max-autotune-no-cudagraphs

# Generate video
image = load_image(
    "https://huggingface.co/kandinsky-community/kandinsky-3/resolve/main/assets/title.jpg?download=true"
)
height = 896
width = 896
image = image.resize((width, height))

prompt = "A funny furry creature smiles happily and holds a sign that says 'Kandinsky'"
negative_prompt = ""

output = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    height=height,
    width=width,
    num_frames=121,  # ~5 seconds at 24fps
    num_inference_steps=50,
    guidance_scale=5.0,
).frames[0]

export_to_video(output, "output.mp4", fps=24, quality=9)
```

## Kandinsky 5.0 Pro Side-by-Side evaluation

<table border="0" style="width: 200; text-align: left; margin-top: 20px;">
  <tr>
    <td>
      <img width="200" alt="image" src="https://github.com/user-attachments/assets/73e5ff00-2735-40fd-8f01-767de9181918" />
    </td>
    <td>
      <img width="200" alt="image" src="https://github.com/user-attachments/assets/f449a9e7-74b7-481d-82da-02723e396acd" />
    </td>
  </tr>
  <tr>
    <td>
      Comparison with Veo 3
    </td>
    <td>
      Comparison with Veo 3 fast
    </td>
  </tr>
  <tr>
    <td>
      <img width="200" alt="image" src="https://github.com/user-attachments/assets/a6902fb6-b5e8-4093-adad-aa4caab79c6d" />
    </td>
    <td>
      <img width="200" alt="image" src="https://github.com/user-attachments/assets/09986015-3d07-4de8-b942-c145039b9b2d" />
    </td>
  </tr>
  <tr>
    <td>
      Comparison with Wan 2.2 A14B Text-to-Video mode
    </td>
    <td>
      Comparison with Wan 2.2 A14B Image-to-Video mode
    </td>
  </tr>
</table>

## Kandinsky 5.0 Lite Side-by-Side evaluation

The evaluation is based on the expanded prompts from the [Movie Gen benchmark](https://github.com/facebookresearch/MovieGenBench), which are available in the `expanded_prompt` column of the `benchmark/moviegen_bench.csv` file.
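To rerun the comparison with the same inputs, the expanded prompts can be read directly from that CSV (a minimal sketch; the relative path assumes a checkout of the [kandinskylab/Kandinsky-5](https://github.com/kandinskylab/Kandinsky-5) repository as the working directory):

```python
import pandas as pd

# `benchmark/moviegen_bench.csv` and its `expanded_prompt` column are taken from the text above.
bench = pd.read_csv("benchmark/moviegen_bench.csv")
prompts = bench["expanded_prompt"].tolist()
print(len(prompts), prompts[0])
```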
<table border="0" style="width: 400; text-align: left; margin-top: 20px;">
  <tr>
    <td>
      <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_vs_sora.jpg" width=400 >
    </td>
    <td>
      <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_14B.jpg" width=400 >
    </td>
  </tr>
  <tr>
    <td>
      <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_5B.jpg" width=400 >
    </td>
    <td>
      <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_vs_wan_2.2_A14B.jpg" width=400 >
    </td>
  </tr>
  <tr>
    <td>
      <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_vs_wan_2.1_1.3B.jpg" width=400 >
    </td>
  </tr>
</table>

## Kandinsky 5.0 Lite Distill Side-by-Side evaluation

<table border="0" style="width: 400; text-align: left; margin-top: 20px;">
  <tr>
    <td>
      <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_5s_vs_kandinsky_5_video_lite_distill_5s.jpg" width=400 >
    </td>
    <td>
      <img src="https://github.com/kandinskylab/kandinsky-5/raw/main/assets/sbs/kandinsky_5_video_lite_10s_vs_kandinsky_5_video_lite_distill_10s.jpg" width=400 >
    </td>
  </tr>
</table>

## Kandinsky5T2VPipeline

[[autodoc]] Kandinsky5T2VPipeline
  - all
  - __call__

## Kandinsky5I2VPipeline

[[autodoc]] Kandinsky5I2VPipeline
  - all
  - __call__

## Citation

```bibtex
@misc{kandinsky2025,
    author = {Alexey Letunovskiy and Maria Kovaleva and Ivan Kirillov and Lev Novitskiy and Denis Koposov and
              Dmitrii Mikhailov and Anna Averchenkova and Andrey Shutkin and Julia Agafonova and Olga Kim and
              Anastasiia Kargapoltseva and Nikita Kiselev and Vladimir Arkhipkin and Vladimir Korviakov and
              Nikolai Gerasimenko and Denis Parkhomenko and Anna Dmitrienko and Anastasia Maltseva and
              Kirill Chernyshev and Ilia Vasiliev and Viacheslav Vasilev and Vladimir Polovnikov and
              Yury Kolabushin and Alexander Belykh and Mikhail Mamaev and Anastasia Aliaskina and
              Tatiana Nikulina and Polina Gavrilova and Denis Dimitrov},
    author = {Alexander Belykh and Alexander Varlamov and Alexey Letunovskiy and Anastasia Aliaskina and Anastasia Maltseva and Anastasiia Kargapoltseva and Andrey Shutkin and Anna Averchenkova and Anna Dmitrienko and Bulat Akhmatov and Denis Dimitrov and Denis Koposov and Denis Parkhomenko and Dmitrii Mikhailov and Ilya Vasiliev and Ivan Kirillov and Julia Agafonova and Kirill Chernyshev and Kormilitsyn Semen and Lev Novitskiy and Maria Kovaleva and Mikhail Mamaev and Nikita Kiselev and Nikita Osterov and Nikolai Gerasimenko and Nikolai Vaulin and Olga Kim and Olga Vdovchenko and Polina Gavrilova and Polina Mikhailova and Tatiana Nikulina and Viacheslav Vasilev and Vladimir Arkhipkin and Vladimir Korviakov and Vladimir Polovnikov and Yury Kolabushin},
    title = {Kandinsky 5.0: A family of diffusion models for Video & Image generation},
    howpublished = {\url{https://github.com/ai-forever/Kandinsky-5}},
    howpublished = {\url{https://github.com/kandinskylab/Kandinsky-5}},
    year = 2025
}
```

@@ -499,6 +499,9 @@ else:
|
||||
"ImageTextPipelineOutput",
|
||||
"Kandinsky3Img2ImgPipeline",
|
||||
"Kandinsky3Pipeline",
|
||||
"Kandinsky5I2IPipeline",
|
||||
"Kandinsky5I2VPipeline",
|
||||
"Kandinsky5T2IPipeline",
|
||||
"Kandinsky5T2VPipeline",
|
||||
"KandinskyCombinedPipeline",
|
||||
"KandinskyImg2ImgCombinedPipeline",
|
||||
@@ -1194,6 +1197,9 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
ImageTextPipelineOutput,
|
||||
Kandinsky3Img2ImgPipeline,
|
||||
Kandinsky3Pipeline,
|
||||
Kandinsky5I2IPipeline,
|
||||
Kandinsky5I2VPipeline,
|
||||
Kandinsky5T2IPipeline,
|
||||
Kandinsky5T2VPipeline,
|
||||
KandinskyCombinedPipeline,
|
||||
KandinskyImg2ImgCombinedPipeline,
|
||||
|
||||
@@ -398,8 +398,13 @@ else:
|
||||
"WanVACEPipeline",
|
||||
"WanAnimatePipeline",
|
||||
]
|
||||
_import_structure["kandinsky5"] = [
|
||||
"Kandinsky5T2VPipeline",
|
||||
"Kandinsky5I2VPipeline",
|
||||
"Kandinsky5T2IPipeline",
|
||||
"Kandinsky5I2IPipeline",
|
||||
]
|
||||
_import_structure["z_image"] = ["ZImagePipeline"]
|
||||
_import_structure["kandinsky5"] = ["Kandinsky5T2VPipeline"]
|
||||
_import_structure["skyreels_v2"] = [
|
||||
"SkyReelsV2DiffusionForcingPipeline",
|
||||
"SkyReelsV2DiffusionForcingImageToVideoPipeline",
|
||||
@@ -695,7 +700,12 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
Kandinsky3Img2ImgPipeline,
|
||||
Kandinsky3Pipeline,
|
||||
)
|
||||
from .kandinsky5 import Kandinsky5T2VPipeline
|
||||
from .kandinsky5 import (
|
||||
Kandinsky5I2IPipeline,
|
||||
Kandinsky5I2VPipeline,
|
||||
Kandinsky5T2IPipeline,
|
||||
Kandinsky5T2VPipeline,
|
||||
)
|
||||
from .latent_consistency_models import (
|
||||
LatentConsistencyModelImg2ImgPipeline,
|
||||
LatentConsistencyModelPipeline,
|
||||
|
||||
@@ -23,6 +23,9 @@ except OptionalDependencyNotAvailable:
|
||||
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
|
||||
else:
|
||||
_import_structure["pipeline_kandinsky"] = ["Kandinsky5T2VPipeline"]
|
||||
_import_structure["pipeline_kandinsky_i2i"] = ["Kandinsky5I2IPipeline"]
|
||||
_import_structure["pipeline_kandinsky_i2v"] = ["Kandinsky5I2VPipeline"]
|
||||
_import_structure["pipeline_kandinsky_t2i"] = ["Kandinsky5T2IPipeline"]
|
||||
|
||||
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
try:
|
||||
@@ -33,6 +36,9 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
from ...utils.dummy_torch_and_transformers_objects import *
|
||||
else:
|
||||
from .pipeline_kandinsky import Kandinsky5T2VPipeline
|
||||
from .pipeline_kandinsky_i2i import Kandinsky5I2IPipeline
|
||||
from .pipeline_kandinsky_i2v import Kandinsky5I2VPipeline
|
||||
from .pipeline_kandinsky_t2i import Kandinsky5T2IPipeline
|
||||
|
||||
else:
|
||||
import sys
|
||||
|
||||
@@ -25,7 +25,14 @@ from ...loaders import KandinskyLoraLoaderMixin
|
||||
from ...models import AutoencoderKLHunyuanVideo
|
||||
from ...models.transformers import Kandinsky5Transformer3DModel
|
||||
from ...schedulers import FlowMatchEulerDiscreteScheduler
|
||||
from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
|
||||
|
||||
# Add imports for offloading and tiling
|
||||
from ...utils import (
|
||||
is_ftfy_available,
|
||||
is_torch_xla_available,
|
||||
logging,
|
||||
replace_example_docstring,
|
||||
)
|
||||
from ...utils.torch_utils import randn_tensor
|
||||
from ...video_processor import VideoProcessor
|
||||
from ..pipeline_utils import DiffusionPipeline
|
||||
@@ -56,12 +63,17 @@ EXAMPLE_DOC_STRING = """
|
||||
>>> from diffusers.utils import export_to_video
|
||||
|
||||
>>> # Available models:
|
||||
>>> # ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers
|
||||
>>> # ai-forever/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers
|
||||
>>> # ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers
|
||||
>>> # ai-forever/Kandinsky-5.0-T2V-Lite-pretrain-5s-Diffusers
|
||||
>>> # kandinskylab/Kandinsky-5.0-T2V-Pro-sft-5s-Diffusers
|
||||
>>> # kandinskylab/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers
|
||||
>>> # kandinskylab/Kandinsky-5.0-T2V-Lite-nocfg-5s-Diffusers
|
||||
>>> # kandinskylab/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers
|
||||
>>> # kandinskylab/Kandinsky-5.0-T2V-Lite-pretrain-5s-Diffusers
|
||||
>>> # kandinskylab/Kandinsky-5.0-T2V-Lite-sft-10s-Diffusers
|
||||
>>> # kandinskylab/Kandinsky-5.0-T2V-Lite-nocfg-10s-Diffusers
|
||||
>>> # kandinskylab/Kandinsky-5.0-T2V-Lite-distilled16steps-10s-Diffusers
|
||||
>>> # kandinskylab/Kandinsky-5.0-T2V-Lite-pretrain-10s-Diffusers
|
||||
|
||||
>>> model_id = "ai-forever/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers"
|
||||
>>> model_id = "kandinskylab/Kandinsky-5.0-T2V-Lite-sft-5s-Diffusers"
|
||||
>>> pipe = Kandinsky5T2VPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
|
||||
>>> pipe = pipe.to("cuda")
|
||||
|
||||
@@ -84,7 +96,11 @@ EXAMPLE_DOC_STRING = """
|
||||
|
||||
|
||||
def basic_clean(text):
|
||||
"""Clean text using ftfy if available and unescape HTML entities."""
|
||||
"""
|
||||
Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
|
||||
|
||||
Clean text using ftfy if available and unescape HTML entities.
|
||||
"""
|
||||
if is_ftfy_available():
|
||||
text = ftfy.fix_text(text)
|
||||
text = html.unescape(html.unescape(text))
|
||||
@@ -92,14 +108,22 @@ def basic_clean(text):
|
||||
|
||||
|
||||
def whitespace_clean(text):
|
||||
"""Normalize whitespace in text by replacing multiple spaces with single space."""
|
||||
"""
|
||||
Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
|
||||
|
||||
Normalize whitespace in text by replacing multiple spaces with single space.
|
||||
"""
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
text = text.strip()
|
||||
return text
|
||||
|
||||
|
||||
def prompt_clean(text):
|
||||
"""Apply both basic cleaning and whitespace normalization to prompts."""
|
||||
"""
|
||||
Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
|
||||
|
||||
Apply both basic cleaning and whitespace normalization to prompts.
|
||||
"""
|
||||
text = whitespace_clean(basic_clean(text))
|
||||
return text
|
||||
|
||||
@@ -115,13 +139,16 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
transformer ([`Kandinsky5Transformer3DModel`]):
|
||||
Conditional Transformer to denoise the encoded video latents.
|
||||
vae ([`AutoencoderKLHunyuanVideo`]):
|
||||
Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
|
||||
Variational Auto-Encoder Model [hunyuanvideo-community/HunyuanVideo
|
||||
(vae)](https://huggingface.co/hunyuanvideo-community/HunyuanVideo) to encode and decode videos to and from
|
||||
latent representations.
|
||||
text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
|
||||
Frozen text-encoder (Qwen2.5-VL).
|
||||
Frozen text-encoder [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct).
|
||||
tokenizer ([`AutoProcessor`]):
|
||||
Tokenizer for Qwen2.5-VL.
|
||||
text_encoder_2 ([`CLIPTextModel`]):
|
||||
Frozen CLIP text encoder.
|
||||
Frozen [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel),
|
||||
specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
||||
tokenizer_2 ([`CLIPTokenizer`]):
|
||||
Tokenizer for CLIP.
|
||||
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
||||
@@ -179,6 +206,26 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
self.vae_scale_factor_spatial = self.vae.config.spatial_compression_ratio if getattr(self, "vae", None) else 8
|
||||
self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
|
||||
|
||||
def _get_scale_factor(self, height: int, width: int) -> tuple:
|
||||
"""
|
||||
Calculate the scale factor based on resolution.
|
||||
|
||||
Args:
|
||||
height (int): Video height
|
||||
width (int): Video width
|
||||
|
||||
Returns:
|
||||
tuple: Scale factor as (temporal_scale, height_scale, width_scale)
|
||||
"""
|
||||
|
||||
def between_480p(x):
|
||||
return 480 <= x <= 854
|
||||
|
||||
if between_480p(height) and between_480p(width):
|
||||
return (1, 2, 2)
|
||||
else:
|
||||
return (1, 3.16, 3.16)
|
||||
|
||||
@staticmethod
|
||||
def fast_sta_nabla(T: int, H: int, W: int, wT: int = 3, wH: int = 3, wW: int = 3, device="cuda") -> torch.Tensor:
|
||||
"""
|
||||
@@ -290,12 +337,32 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
dtype = dtype or self.text_encoder.dtype
|
||||
|
||||
full_texts = [self.prompt_template.format(p) for p in prompt]
|
||||
max_allowed_len = self.prompt_template_encode_start_idx + max_sequence_length
|
||||
|
||||
untruncated_ids = self.tokenizer(
|
||||
text=full_texts,
|
||||
images=None,
|
||||
videos=None,
|
||||
return_tensors="pt",
|
||||
padding="longest",
|
||||
)["input_ids"]
|
||||
|
||||
if untruncated_ids.shape[-1] > max_allowed_len:
|
||||
for i, text in enumerate(full_texts):
|
||||
tokens = untruncated_ids[i][self.prompt_template_encode_start_idx : -2]
|
||||
removed_text = self.tokenizer.decode(tokens[max_sequence_length - 2 :])
|
||||
if len(removed_text) > 0:
|
||||
full_texts[i] = text[: -len(removed_text)]
|
||||
logger.warning(
|
||||
"The following part of your input was truncated because `max_sequence_length` is set to "
|
||||
f" {max_sequence_length} tokens: {removed_text}"
|
||||
)
|
||||
|
||||
inputs = self.tokenizer(
|
||||
text=full_texts,
|
||||
images=None,
|
||||
videos=None,
|
||||
max_length=max_sequence_length + self.prompt_template_encode_start_idx,
|
||||
max_length=max_allowed_len,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
@@ -456,6 +523,7 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
prompt_cu_seqlens=None,
|
||||
negative_prompt_cu_seqlens=None,
|
||||
callback_on_step_end_tensor_inputs=None,
|
||||
max_sequence_length=None,
|
||||
):
|
||||
"""
|
||||
Validate input parameters for the pipeline.
|
||||
@@ -476,6 +544,10 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
Raises:
|
||||
ValueError: If inputs are invalid
|
||||
"""
|
||||
|
||||
if max_sequence_length is not None and max_sequence_length > 1024:
    raise ValueError("`max_sequence_length` must not exceed 1024")
|
||||
|
||||
if height % 16 != 0 or width % 16 != 0:
|
||||
raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
|
||||
|
||||
@@ -597,11 +669,6 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
"""Get the current guidance scale value."""
|
||||
return self._guidance_scale
|
||||
|
||||
@property
|
||||
def do_classifier_free_guidance(self):
|
||||
"""Check if classifier-free guidance is enabled."""
|
||||
return self._guidance_scale > 1.0
|
||||
|
||||
@property
|
||||
def num_timesteps(self):
|
||||
"""Get the number of denoising timesteps."""
|
||||
@@ -639,7 +706,6 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
max_sequence_length: int = 512,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
The call function to the pipeline for generation.
|
||||
@@ -704,6 +770,7 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
prompt_cu_seqlens=prompt_cu_seqlens,
|
||||
negative_prompt_cu_seqlens=negative_prompt_cu_seqlens,
|
||||
callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
|
||||
max_sequence_length=max_sequence_length,
|
||||
)
|
||||
|
||||
if num_frames % self.vae_scale_factor_temporal != 1:
|
||||
@@ -737,7 +804,7 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
if self.do_classifier_free_guidance:
|
||||
if self.guidance_scale > 1.0:
|
||||
if negative_prompt is None:
|
||||
negative_prompt = "Static, 2D cartoon, cartoon, 2d animation, paintings, images, worst quality, low quality, ugly, deformed, walking backwards"
|
||||
|
||||
@@ -792,10 +859,13 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
else None
|
||||
)
|
||||
|
||||
# 7. Sparse Params for efficient attention
|
||||
# 7. Calculate dynamic scale factor based on resolution
|
||||
scale_factor = self._get_scale_factor(height, width)
|
||||
|
||||
# 8. Sparse Params for efficient attention
|
||||
sparse_params = self.get_sparse_params(latents, device)
|
||||
|
||||
# 8. Denoising loop
|
||||
# 9. Denoising loop
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
self._num_timesteps = len(timesteps)
|
||||
|
||||
@@ -814,12 +884,12 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
timestep=timestep.to(dtype),
|
||||
visual_rope_pos=visual_rope_pos,
|
||||
text_rope_pos=text_rope_pos,
|
||||
scale_factor=(1, 2, 2),
|
||||
scale_factor=scale_factor,
|
||||
sparse_params=sparse_params,
|
||||
return_dict=True,
|
||||
).sample
|
||||
|
||||
if self.do_classifier_free_guidance and negative_prompt_embeds_qwen is not None:
|
||||
if self.guidance_scale > 1.0 and negative_prompt_embeds_qwen is not None:
|
||||
uncond_pred_velocity = self.transformer(
|
||||
hidden_states=latents.to(dtype),
|
||||
encoder_hidden_states=negative_prompt_embeds_qwen.to(dtype),
|
||||
@@ -827,7 +897,7 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
timestep=timestep.to(dtype),
|
||||
visual_rope_pos=visual_rope_pos,
|
||||
text_rope_pos=negative_text_rope_pos,
|
||||
scale_factor=(1, 2, 2),
|
||||
scale_factor=scale_factor,
|
||||
sparse_params=sparse_params,
|
||||
return_dict=True,
|
||||
).sample
|
||||
@@ -860,10 +930,10 @@ class Kandinsky5T2VPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
if XLA_AVAILABLE:
|
||||
xm.mark_step()
|
||||
|
||||
# 8. Post-processing - extract main latents
|
||||
# 10. Post-processing - extract main latents
|
||||
latents = latents[:, :, :, :, :num_channels_latents]
|
||||
|
||||
# 9. Decode latents to video
|
||||
# 11. Decode latents to video
|
||||
if output_type != "latent":
|
||||
latents = latents.to(self.vae.dtype)
|
||||
# Reshape and normalize latents
|
||||
|
||||
863  src/diffusers/pipelines/kandinsky5/pipeline_kandinsky_i2i.py  Normal file
@@ -0,0 +1,863 @@
|
||||
# Copyright 2025 The Kandinsky Team and The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import html
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import regex as re
|
||||
import torch
|
||||
from torch.nn import functional as F
|
||||
from transformers import CLIPTextModel, CLIPTokenizer, Qwen2_5_VLForConditionalGeneration, Qwen2VLProcessor
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import PipelineImageInput, VaeImageProcessor
|
||||
from ...loaders import KandinskyLoraLoaderMixin
|
||||
from ...models import AutoencoderKL
|
||||
from ...models.transformers import Kandinsky5Transformer3DModel
|
||||
from ...schedulers import FlowMatchEulerDiscreteScheduler
|
||||
|
||||
# Add imports for offloading and tiling
|
||||
from ...utils import (
|
||||
is_ftfy_available,
|
||||
is_torch_xla_available,
|
||||
logging,
|
||||
replace_example_docstring,
|
||||
)
|
||||
from ...utils.torch_utils import randn_tensor
|
||||
from ..pipeline_utils import DiffusionPipeline
|
||||
from .pipeline_output import KandinskyImagePipelineOutput
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
XLA_AVAILABLE = True
|
||||
else:
|
||||
XLA_AVAILABLE = False
|
||||
|
||||
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

if is_ftfy_available():
    import ftfy
|
||||
|
||||
EXAMPLE_DOC_STRING = """
|
||||
Examples:
|
||||
|
||||
```python
|
||||
>>> import torch
|
||||
>>> from diffusers import Kandinsky5I2IPipeline
|
||||
|
||||
>>> # Available models:
|
||||
>>> # kandinskylab/Kandinsky-5.0-I2I-Lite-sft-Diffusers
|
||||
>>> # kandinskylab/Kandinsky-5.0-I2I-Lite-pretrain-Diffusers
|
||||
|
||||
>>> model_id = "kandinskylab/Kandinsky-5.0-I2I-Lite-sft-Diffusers"
|
||||
>>> pipe = Kandinsky5I2IPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
|
||||
>>> pipe = pipe.to("cuda")
|
||||
|
||||
>>> from diffusers.utils import load_image

>>> image = load_image(
...     "https://huggingface.co/kandinsky-community/kandinsky-3/resolve/main/assets/title.jpg?download=true"
... )
>>> prompt = "A cat and a dog baking a cake together in a kitchen."

>>> output = pipe(
...     image=image,
...     prompt=prompt,
...     negative_prompt="",
...     height=1024,
...     width=1024,
...     num_inference_steps=50,
...     guidance_scale=3.5,
... ).image[0]
|
||||
```
|
||||
"""
|
||||
|
||||
|
||||
def basic_clean(text):
|
||||
"""
|
||||
Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
|
||||
|
||||
Clean text using ftfy if available and unescape HTML entities.
|
||||
"""
|
||||
if is_ftfy_available():
|
||||
text = ftfy.fix_text(text)
|
||||
text = html.unescape(html.unescape(text))
|
||||
return text.strip()
|
||||
|
||||
|
||||
def whitespace_clean(text):
|
||||
"""
|
||||
Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
|
||||
|
||||
Normalize whitespace in text by replacing multiple spaces with single space.
|
||||
"""
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
text = text.strip()
|
||||
return text
|
||||
|
||||
|
||||
def prompt_clean(text):
|
||||
"""
|
||||
Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
|
||||
|
||||
Apply both basic cleaning and whitespace normalization to prompts.
|
||||
"""
|
||||
text = whitespace_clean(basic_clean(text))
|
||||
return text
|
||||
|
||||
|
||||
class Kandinsky5I2IPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
r"""
|
||||
Pipeline for image-to-image generation using Kandinsky 5.0.
|
||||
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
|
||||
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
|
||||
|
||||
Args:
|
||||
transformer ([`Kandinsky5Transformer3DModel`]):
|
||||
Conditional Transformer to denoise the encoded image latents.
|
||||
vae ([`AutoencoderKL`]):
|
||||
Variational Auto-Encoder Model [black-forest-labs/FLUX.1-dev
|
||||
(vae)](https://huggingface.co/black-forest-labs/FLUX.1-dev) to encode and decode videos to and from latent
|
||||
representations.
|
||||
text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
|
||||
Frozen text-encoder [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct).
|
||||
tokenizer ([`AutoProcessor`]):
|
||||
Tokenizer for Qwen2.5-VL.
|
||||
text_encoder_2 ([`CLIPTextModel`]):
|
||||
Frozen [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel),
|
||||
specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
||||
tokenizer_2 ([`CLIPTokenizer`]):
|
||||
Tokenizer for CLIP.
|
||||
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
||||
A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
|
||||
"""
|
||||
|
||||
model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
|
||||
_callback_tensor_inputs = [
|
||||
"latents",
|
||||
"prompt_embeds_qwen",
|
||||
"prompt_embeds_clip",
|
||||
"negative_prompt_embeds_qwen",
|
||||
"negative_prompt_embeds_clip",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
transformer: Kandinsky5Transformer3DModel,
|
||||
vae: AutoencoderKL,
|
||||
text_encoder: Qwen2_5_VLForConditionalGeneration,
|
||||
tokenizer: Qwen2VLProcessor,
|
||||
text_encoder_2: CLIPTextModel,
|
||||
tokenizer_2: CLIPTokenizer,
|
||||
scheduler: FlowMatchEulerDiscreteScheduler,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.register_modules(
|
||||
transformer=transformer,
|
||||
vae=vae,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
text_encoder_2=text_encoder_2,
|
||||
tokenizer_2=tokenizer_2,
|
||||
scheduler=scheduler,
|
||||
)
|
||||
self.prompt_template = "<|im_start|>system\nYou are a promt engineer. Based on the provided source image (first image) and target image (second image), create an interesting text prompt that can be used together with the source image to create the target image:<|im_end|><|im_start|>user{}<|vision_start|><|image_pad|><|vision_end|><|im_end|>"
|
||||
self.prompt_template_encode_start_idx = 55
|
||||
|
||||
self.vae_scale_factor_spatial = 8
|
||||
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
|
||||
self.resolutions = [(1024, 1024), (640, 1408), (1408, 640), (768, 1280), (1280, 768), (896, 1152), (1152, 896)]
|
||||
|
||||
def _encode_prompt_qwen(
|
||||
self,
|
||||
prompt: List[str],
|
||||
image: Optional[PipelineImageInput] = None,
|
||||
device: Optional[torch.device] = None,
|
||||
max_sequence_length: int = 1024,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
):
|
||||
"""
|
||||
Encode prompt using Qwen2.5-VL text encoder.
|
||||
|
||||
This method processes the input prompt through the Qwen2.5-VL model to generate text embeddings suitable for
|
||||
image generation.
|
||||
|
||||
Args:
|
||||
prompt List[str]: Input list of prompts
|
||||
image (PipelineImageInput): Input list of images to condition the generation on
|
||||
device (torch.device): Device to run encoding on
|
||||
max_sequence_length (int): Maximum sequence length for tokenization
|
||||
dtype (torch.dtype): Data type for embeddings
|
||||
|
||||
Returns:
|
||||
Tuple[torch.Tensor, torch.Tensor]: Text embeddings and cumulative sequence lengths
|
||||
"""
|
||||
device = device or self._execution_device
|
||||
dtype = dtype or self.text_encoder.dtype
|
||||
if not isinstance(image, list):
|
||||
image = [image]
|
||||
image = [i.resize((i.size[0] // 2, i.size[1] // 2)) for i in image]
|
||||
full_texts = [self.prompt_template.format(p) for p in prompt]
|
||||
max_allowed_len = self.prompt_template_encode_start_idx + max_sequence_length
|
||||
|
||||
untruncated_ids = self.tokenizer(
|
||||
text=full_texts,
|
||||
images=image,
|
||||
videos=None,
|
||||
return_tensors="pt",
|
||||
padding="longest",
|
||||
)["input_ids"]
|
||||
|
||||
if untruncated_ids.shape[-1] > max_allowed_len:
|
||||
for i, text in enumerate(full_texts):
|
||||
tokens = untruncated_ids[i]
|
||||
num_image_tokens = (tokens == self.tokenizer.image_token_id).sum()
|
||||
tokens = tokens[tokens != self.tokenizer.image_token_id][self.prompt_template_encode_start_idx : -3]
|
||||
removed_text = self.tokenizer.decode(tokens[max_sequence_length - num_image_tokens - 3 :])
|
||||
if len(removed_text) > 0:
|
||||
full_texts[i] = text[: -len(removed_text)]
|
||||
logger.warning(
|
||||
"The following part of your input was truncated because `max_sequence_length` is set to "
|
||||
f" {max_sequence_length} tokens: {removed_text}"
|
||||
)
|
||||
|
||||
inputs = self.tokenizer(
|
||||
text=full_texts,
|
||||
images=image,
|
||||
videos=None,
|
||||
max_length=max_allowed_len,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
).to(device)
|
||||
|
||||
embeds = self.text_encoder(
|
||||
**inputs,
|
||||
return_dict=True,
|
||||
output_hidden_states=True,
|
||||
)["hidden_states"][-1][:, self.prompt_template_encode_start_idx :]
|
||||
|
||||
attention_mask = inputs["attention_mask"][:, self.prompt_template_encode_start_idx :]
|
||||
cu_seqlens = torch.cumsum(attention_mask.sum(1), dim=0)
|
||||
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0).to(dtype=torch.int32)
|
||||
|
||||
return embeds.to(dtype), cu_seqlens
|
||||
|
||||
def _encode_prompt_clip(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
device: Optional[torch.device] = None,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
):
|
||||
"""
|
||||
Encode prompt using CLIP text encoder.
|
||||
|
||||
This method processes the input prompt through the CLIP model to generate pooled embeddings that capture
|
||||
semantic information.
|
||||
|
||||
Args:
|
||||
prompt (Union[str, List[str]]): Input prompt or list of prompts
|
||||
device (torch.device): Device to run encoding on
|
||||
dtype (torch.dtype): Data type for embeddings
|
||||
|
||||
Returns:
|
||||
torch.Tensor: Pooled text embeddings from CLIP
|
||||
"""
|
||||
device = device or self._execution_device
|
||||
dtype = dtype or self.text_encoder_2.dtype
|
||||
|
||||
inputs = self.tokenizer_2(
|
||||
prompt,
|
||||
max_length=77,
|
||||
truncation=True,
|
||||
add_special_tokens=True,
|
||||
padding="max_length",
|
||||
return_tensors="pt",
|
||||
).to(device)
|
||||
|
||||
pooled_embed = self.text_encoder_2(**inputs)["pooler_output"]
|
||||
|
||||
return pooled_embed.to(dtype)
|
||||
|
||||
def encode_prompt(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
image: torch.Tensor,
|
||||
num_images_per_prompt: int = 1,
|
||||
max_sequence_length: int = 1024,
|
||||
device: Optional[torch.device] = None,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
):
|
||||
r"""
|
||||
Encodes a single prompt (positive or negative) into text encoder hidden states.
|
||||
|
||||
This method combines embeddings from both Qwen2.5-VL and CLIP text encoders to create comprehensive text
|
||||
representations for image generation.
|
||||
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
Prompt to be encoded.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
Number of images to generate per prompt.
|
||||
max_sequence_length (`int`, *optional*, defaults to 1024):
|
||||
Maximum sequence length for text encoding. Must be less than 1024
|
||||
device (`torch.device`, *optional*):
|
||||
Torch device.
|
||||
dtype (`torch.dtype`, *optional*):
|
||||
Torch dtype.
|
||||
|
||||
Returns:
|
||||
Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
- Qwen text embeddings of shape (batch_size * num_images_per_prompt, sequence_length, embedding_dim)
|
||||
- CLIP pooled embeddings of shape (batch_size * num_images_per_prompt, clip_embedding_dim)
|
||||
- Cumulative sequence lengths (`cu_seqlens`) for Qwen embeddings of shape (batch_size *
|
||||
num_images_per_prompt + 1,)
|
||||
"""
|
||||
device = device or self._execution_device
|
||||
dtype = dtype or self.text_encoder.dtype
|
||||
|
||||
if not isinstance(prompt, list):
|
||||
prompt = [prompt]
|
||||
|
||||
batch_size = len(prompt)
|
||||
|
||||
prompt = [prompt_clean(p) for p in prompt]
|
||||
|
||||
# Encode with Qwen2.5-VL
|
||||
prompt_embeds_qwen, prompt_cu_seqlens = self._encode_prompt_qwen(
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
device=device,
|
||||
max_sequence_length=max_sequence_length,
|
||||
dtype=dtype,
|
||||
)
|
||||
# prompt_embeds_qwen shape: [batch_size, seq_len, embed_dim]
|
||||
|
||||
# Encode with CLIP
|
||||
prompt_embeds_clip = self._encode_prompt_clip(
|
||||
prompt=prompt,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
# prompt_embeds_clip shape: [batch_size, clip_embed_dim]
|
||||
|
||||
# Repeat embeddings for num_images_per_prompt
|
||||
# Qwen embeddings: repeat sequence for each image, then reshape
|
||||
prompt_embeds_qwen = prompt_embeds_qwen.repeat(
|
||||
1, num_images_per_prompt, 1
|
||||
) # [batch_size, seq_len * num_images_per_prompt, embed_dim]
|
||||
# Reshape to [batch_size * num_images_per_prompt, seq_len, embed_dim]
|
||||
prompt_embeds_qwen = prompt_embeds_qwen.view(
|
||||
batch_size * num_images_per_prompt, -1, prompt_embeds_qwen.shape[-1]
|
||||
)
|
||||
|
||||
# CLIP embeddings: repeat for each image
|
||||
prompt_embeds_clip = prompt_embeds_clip.repeat(
|
||||
1, num_images_per_prompt, 1
|
||||
) # [batch_size, num_images_per_prompt, clip_embed_dim]
|
||||
# Reshape to [batch_size * num_images_per_prompt, clip_embed_dim]
|
||||
prompt_embeds_clip = prompt_embeds_clip.view(batch_size * num_images_per_prompt, -1)
|
||||
|
||||
# Repeat cumulative sequence lengths for num_images_per_prompt
|
||||
# Original differences (lengths) for each prompt in the batch
|
||||
original_lengths = prompt_cu_seqlens.diff() # [len1, len2, ...]
|
||||
# Repeat the lengths for num_images_per_prompt
|
||||
repeated_lengths = original_lengths.repeat_interleave(
|
||||
num_images_per_prompt
|
||||
) # [len1, len1, ..., len2, len2, ...]
|
||||
# Reconstruct the cumulative lengths
|
||||
repeated_cu_seqlens = torch.cat(
|
||||
[torch.tensor([0], device=device, dtype=torch.int32), repeated_lengths.cumsum(0)]
|
||||
)
|
||||
|
||||
return prompt_embeds_qwen, prompt_embeds_clip, repeated_cu_seqlens
|
||||
|
||||
def check_inputs(
|
||||
self,
|
||||
prompt,
|
||||
negative_prompt,
|
||||
image,
|
||||
height,
|
||||
width,
|
||||
prompt_embeds_qwen=None,
|
||||
prompt_embeds_clip=None,
|
||||
negative_prompt_embeds_qwen=None,
|
||||
negative_prompt_embeds_clip=None,
|
||||
prompt_cu_seqlens=None,
|
||||
negative_prompt_cu_seqlens=None,
|
||||
callback_on_step_end_tensor_inputs=None,
|
||||
max_sequence_length=None,
|
||||
):
|
||||
"""
|
||||
Validate input parameters for the pipeline.
|
||||
|
||||
Args:
|
||||
prompt: Input prompt
|
||||
negative_prompt: Negative prompt for guidance
|
||||
image: Input image for conditioning
|
||||
height: Image height
|
||||
width: Image width
|
||||
prompt_embeds_qwen: Pre-computed Qwen prompt embeddings
|
||||
prompt_embeds_clip: Pre-computed CLIP prompt embeddings
|
||||
negative_prompt_embeds_qwen: Pre-computed Qwen negative prompt embeddings
|
||||
negative_prompt_embeds_clip: Pre-computed CLIP negative prompt embeddings
|
||||
prompt_cu_seqlens: Pre-computed cumulative sequence lengths for Qwen positive prompt
|
||||
negative_prompt_cu_seqlens: Pre-computed cumulative sequence lengths for Qwen negative prompt
|
||||
callback_on_step_end_tensor_inputs: Callback tensor inputs
|
||||
|
||||
Raises:
|
||||
ValueError: If inputs are invalid
|
||||
"""
|
||||
|
||||
if max_sequence_length is not None and max_sequence_length > 1024:
    raise ValueError("`max_sequence_length` must not exceed 1024")
|
||||
|
||||
if image is None:
|
||||
raise ValueError("`image` must be provided for image-to-image generation")
|
||||
|
||||
if (width, height) not in self.resolutions:
|
||||
resolutions_str = ",".join([f"({w},{h})" for w, h in self.resolutions])
|
||||
logger.warning(
|
||||
f"`height` and `width` have to be one of {resolutions_str}, but are {height} and {width}. Dimensions will be resized accordingly"
|
||||
)
|
||||
|
||||
if callback_on_step_end_tensor_inputs is not None and not all(
|
||||
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
|
||||
):
|
||||
raise ValueError(
|
||||
f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
|
||||
)
|
||||
|
||||
# Check for consistency within positive prompt embeddings and sequence lengths
|
||||
if prompt_embeds_qwen is not None or prompt_embeds_clip is not None or prompt_cu_seqlens is not None:
|
||||
if prompt_embeds_qwen is None or prompt_embeds_clip is None or prompt_cu_seqlens is None:
|
||||
raise ValueError(
|
||||
"If any of `prompt_embeds_qwen`, `prompt_embeds_clip`, or `prompt_cu_seqlens` is provided, "
|
||||
"all three must be provided."
|
||||
)
|
||||
|
||||
# Check for consistency within negative prompt embeddings and sequence lengths
|
||||
if (
|
||||
negative_prompt_embeds_qwen is not None
|
||||
or negative_prompt_embeds_clip is not None
|
||||
or negative_prompt_cu_seqlens is not None
|
||||
):
|
||||
if (
|
||||
negative_prompt_embeds_qwen is None
|
||||
or negative_prompt_embeds_clip is None
|
||||
or negative_prompt_cu_seqlens is None
|
||||
):
|
||||
raise ValueError(
|
||||
"If any of `negative_prompt_embeds_qwen`, `negative_prompt_embeds_clip`, or `negative_prompt_cu_seqlens` is provided, "
|
||||
"all three must be provided."
|
||||
)
|
||||
|
||||
# Check if prompt or embeddings are provided (either prompt or all required embedding components for positive)
|
||||
if prompt is None and prompt_embeds_qwen is None:
|
||||
raise ValueError(
|
||||
"Provide either `prompt` or `prompt_embeds_qwen` (and corresponding `prompt_embeds_clip` and `prompt_cu_seqlens`). Cannot leave all undefined."
|
||||
)
|
||||
|
||||
# Validate types for prompt and negative_prompt if provided
|
||||
if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
if negative_prompt is not None and (
|
||||
not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
|
||||
):
|
||||
raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
|
||||
|
||||
def prepare_latents(
|
||||
self,
|
||||
image: PipelineImageInput,
|
||||
batch_size: int,
|
||||
num_channels_latents: int = 16,
|
||||
height: int = 1024,
|
||||
width: int = 1024,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
device: Optional[torch.device] = None,
|
||||
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
||||
latents: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Prepare initial latent variables for image-to-image generation.
|
||||
|
||||
This method creates random noise latents and concatenates the VAE-encoded conditioning image (plus a single ones channel) along the channel dimension.
|
||||
|
||||
Args:
|
||||
image (PipelineImageInput): Input image to condition the generation on
|
||||
batch_size (int): Number of images to generate
|
||||
num_channels_latents (int): Number of channels in latent space
|
||||
height (int): Height of generated image
|
||||
width (int): Width of generated image
|
||||
dtype (torch.dtype): Data type for latents
|
||||
device (torch.device): Device to create latents on
|
||||
generator (torch.Generator): Random number generator
|
||||
latents (torch.Tensor): Pre-existing latents to use
|
||||
|
||||
Returns:
|
||||
torch.Tensor: Prepared latent tensor with encoded image
|
||||
"""
|
||||
if latents is not None:
|
||||
return latents.to(device=device, dtype=dtype)
|
||||
|
||||
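# Latent layout is channels-last with a singleton temporal axis: (batch, 1, height // 8, width // 8, channels)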
shape = (
|
||||
batch_size,
|
||||
1,
|
||||
int(height) // self.vae_scale_factor_spatial,
|
||||
int(width) // self.vae_scale_factor_spatial,
|
||||
num_channels_latents,
|
||||
)
|
||||
|
||||
if isinstance(generator, list) and len(generator) != batch_size:
|
||||
raise ValueError(
|
||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
||||
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
||||
)
|
||||
|
||||
# Generate random noise for all frames
|
||||
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
||||
|
||||
# Encode the input image to use as first frame
|
||||
# Preprocess image
|
||||
image_tensor = self.image_processor.preprocess(image, height=height, width=width).to(device, dtype=dtype)
|
||||
# Encode image to latents using VAE
|
||||
with torch.no_grad():
|
||||
image_latents = self.vae.encode(image_tensor).latent_dist.sample(generator=generator)
|
||||
image_latents = image_latents.unsqueeze(2) # Add temporal dimension
|
||||
|
||||
# Normalize latents if needed
|
||||
if hasattr(self.vae.config, "scaling_factor"):
|
||||
image_latents = image_latents * self.vae.config.scaling_factor
|
||||
|
||||
# Reshape to match latent dimensions [batch, 1, height, width, channels]
|
||||
image_latents = image_latents.permute(0, 2, 3, 4, 1) # [batch, 1, H, W, C]
|
||||
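# Concatenate along the last (channel) axis: noisy latents, encoded image latents, and a single
# all-ones channel, so the transformer sees the conditioning image next to the noise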
latents = torch.cat([latents, image_latents, torch.ones_like(latents[..., :1])], -1)
|
||||
|
||||
return latents
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
"""Get the current guidance scale value."""
|
||||
return self._guidance_scale
|
||||
|
||||
@property
|
||||
def num_timesteps(self):
|
||||
"""Get the number of denoising timesteps."""
|
||||
return self._num_timesteps
|
||||
|
||||
@property
|
||||
def interrupt(self):
|
||||
"""Check if generation has been interrupted."""
|
||||
return self._interrupt
|
||||
|
||||
@torch.no_grad()
|
||||
@replace_example_docstring(EXAMPLE_DOC_STRING)
|
||||
def __call__(
|
||||
self,
|
||||
image: PipelineImageInput,
|
||||
prompt: Union[str, List[str]] = None,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 3.5,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
||||
latents: Optional[torch.Tensor] = None,
|
||||
prompt_embeds_qwen: Optional[torch.Tensor] = None,
|
||||
prompt_embeds_clip: Optional[torch.Tensor] = None,
|
||||
negative_prompt_embeds_qwen: Optional[torch.Tensor] = None,
|
||||
negative_prompt_embeds_clip: Optional[torch.Tensor] = None,
|
||||
prompt_cu_seqlens: Optional[torch.Tensor] = None,
|
||||
negative_prompt_cu_seqlens: Optional[torch.Tensor] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
max_sequence_length: int = 1024,
|
||||
):
|
||||
r"""
|
||||
The call function to the pipeline for image-to-image generation.
|
||||
|
||||
Args:
|
||||
image (`PipelineImageInput`):
|
||||
The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
|
||||
prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to guide the image generation. If not defined, pass `prompt_embeds` instead.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to avoid during image generation. If not defined, pass `negative_prompt_embeds`
|
||||
instead. Ignored when not using guidance (`guidance_scale` < `1`).
|
||||
height (`int`):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, defaults to `50`):
|
||||
The number of denoising steps.
|
||||
guidance_scale (`float`, defaults to `3.5`):
|
||||
Guidance scale as defined in classifier-free guidance.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
A torch generator to make generation deterministic.
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents.
|
||||
prompt_embeds_qwen (`torch.Tensor`, *optional*):
|
||||
Pre-generated Qwen text embeddings.
|
||||
prompt_embeds_clip (`torch.Tensor`, *optional*):
|
||||
Pre-generated CLIP text embeddings.
|
||||
negative_prompt_embeds_qwen (`torch.Tensor`, *optional*):
|
||||
Pre-generated Qwen negative text embeddings.
|
||||
negative_prompt_embeds_clip (`torch.Tensor`, *optional*):
|
||||
Pre-generated CLIP negative text embeddings.
|
||||
prompt_cu_seqlens (`torch.Tensor`, *optional*):
|
||||
Pre-generated cumulative sequence lengths for Qwen positive prompt.
|
||||
negative_prompt_cu_seqlens (`torch.Tensor`, *optional*):
|
||||
Pre-generated cumulative sequence lengths for Qwen negative prompt.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`KandinskyImagePipelineOutput`].
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function that is called at the end of each denoising step.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function.
|
||||
max_sequence_length (`int`, defaults to `1024`):
|
||||
The maximum sequence length for text and image Qwen encoding. Must not exceed 1024.
|
||||
|
||||
Examples:
|
||||
|
||||
Returns:
|
||||
[`~KandinskyImagePipelineOutput`] or `tuple`:
|
||||
If `return_dict` is `True`, [`KandinskyImagePipelineOutput`] is returned, otherwise a `tuple` is
|
||||
returned where the first element is a list with the generated images.
|
||||
"""
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
if height is None and width is None:
|
||||
width, height = image[0].size if isinstance(image, list) else image.size
|
||||
self.check_inputs(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
image=image,
|
||||
height=height,
|
||||
width=width,
|
||||
prompt_embeds_qwen=prompt_embeds_qwen,
|
||||
prompt_embeds_clip=prompt_embeds_clip,
|
||||
negative_prompt_embeds_qwen=negative_prompt_embeds_qwen,
|
||||
negative_prompt_embeds_clip=negative_prompt_embeds_clip,
|
||||
prompt_cu_seqlens=prompt_cu_seqlens,
|
||||
negative_prompt_cu_seqlens=negative_prompt_cu_seqlens,
|
||||
callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
|
||||
max_sequence_length=max_sequence_length,
|
||||
)
|
||||
if (width, height) not in self.resolutions:
|
||||
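# Snap to the supported resolution whose aspect ratio is closest to the requested width/height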
width, height = self.resolutions[
|
||||
np.argmin([abs((i[0] / i[1]) - (width / height)) for i in self.resolutions])
|
||||
]
|
||||
|
||||
self._guidance_scale = guidance_scale
|
||||
self._interrupt = False
|
||||
|
||||
device = self._execution_device
|
||||
dtype = self.transformer.dtype
|
||||
|
||||
# 2. Define call parameters
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
prompt = [prompt]
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
else:
|
||||
batch_size = prompt_embeds_qwen.shape[0]
|
||||
|
||||
# 3. Encode input prompt
|
||||
if prompt_embeds_qwen is None:
|
||||
prompt_embeds_qwen, prompt_embeds_clip, prompt_cu_seqlens = self.encode_prompt(
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
max_sequence_length=max_sequence_length,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
if self.guidance_scale > 1.0:
|
||||
if negative_prompt is None:
|
||||
negative_prompt = ""
|
||||
|
||||
if isinstance(negative_prompt, str):
|
||||
negative_prompt = [negative_prompt] * len(prompt) if prompt is not None else [negative_prompt]
|
||||
elif len(negative_prompt) != len(prompt):
|
||||
raise ValueError(
|
||||
f"`negative_prompt` must have same length as `prompt`. Got {len(negative_prompt)} vs {len(prompt)}."
|
||||
)
|
||||
|
||||
if negative_prompt_embeds_qwen is None:
|
||||
negative_prompt_embeds_qwen, negative_prompt_embeds_clip, negative_prompt_cu_seqlens = (
|
||||
self.encode_prompt(
|
||||
prompt=negative_prompt,
|
||||
image=image,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
max_sequence_length=max_sequence_length,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
)
|
||||
|
||||
# 4. Prepare timesteps
|
||||
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
||||
timesteps = self.scheduler.timesteps
|
||||
|
||||
# 5. Prepare latent variables with image conditioning
|
||||
num_channels_latents = self.transformer.config.in_visual_dim
|
||||
latents = self.prepare_latents(
|
||||
image=image,
|
||||
batch_size=batch_size * num_images_per_prompt,
|
||||
num_channels_latents=num_channels_latents,
|
||||
height=height,
|
||||
width=width,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
generator=generator,
|
||||
latents=latents,
|
||||
)
|
||||
|
||||
# 6. Prepare rope positions for positional encoding
|
||||
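# RoPE positions: a single temporal index plus spatial indices at half the latent resolution
# (the latent is patchified 2x2 spatially)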
visual_rope_pos = [
|
||||
torch.arange(1, device=device),
|
||||
torch.arange(height // self.vae_scale_factor_spatial // 2, device=device),
|
||||
torch.arange(width // self.vae_scale_factor_spatial // 2, device=device),
|
||||
]
|
||||
|
||||
text_rope_pos = torch.arange(prompt_cu_seqlens.diff().max().item(), device=device)
|
||||
|
||||
negative_text_rope_pos = (
|
||||
torch.arange(negative_prompt_cu_seqlens.diff().max().item(), device=device)
|
||||
if negative_prompt_cu_seqlens is not None
|
||||
else None
|
||||
)
|
||||
|
||||
# 7. Scale factor (kept at 1.0; no dynamic resolution scaling for image generation)
|
||||
scale_factor = [1.0, 1.0, 1.0]
|
||||
|
||||
# 8. Sparse Params for efficient attention
|
||||
sparse_params = None
|
||||
|
||||
# 9. Denoising loop
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
self._num_timesteps = len(timesteps)
|
||||
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
if self.interrupt:
|
||||
continue
|
||||
|
||||
timestep = t.unsqueeze(0).repeat(batch_size * num_images_per_prompt)
|
||||
|
||||
# Predict noise residual
|
||||
pred_velocity = self.transformer(
|
||||
hidden_states=latents.to(dtype),
|
||||
encoder_hidden_states=prompt_embeds_qwen.to(dtype),
|
||||
pooled_projections=prompt_embeds_clip.to(dtype),
|
||||
timestep=timestep.to(dtype),
|
||||
visual_rope_pos=visual_rope_pos,
|
||||
text_rope_pos=text_rope_pos,
|
||||
scale_factor=scale_factor,
|
||||
sparse_params=sparse_params,
|
||||
return_dict=True,
|
||||
).sample
|
||||
|
||||
if self.guidance_scale > 1.0 and negative_prompt_embeds_qwen is not None:
|
||||
uncond_pred_velocity = self.transformer(
|
||||
hidden_states=latents.to(dtype),
|
||||
encoder_hidden_states=negative_prompt_embeds_qwen.to(dtype),
|
||||
pooled_projections=negative_prompt_embeds_clip.to(dtype),
|
||||
timestep=timestep.to(dtype),
|
||||
visual_rope_pos=visual_rope_pos,
|
||||
text_rope_pos=negative_text_rope_pos,
|
||||
scale_factor=scale_factor,
|
||||
sparse_params=sparse_params,
|
||||
return_dict=True,
|
||||
).sample
|
||||
|
||||
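# Classifier-free guidance: v = v_uncond + s * (v_cond - v_uncond); s > 1 strengthens prompt adherence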
pred_velocity = uncond_pred_velocity + guidance_scale * (pred_velocity - uncond_pred_velocity)
|
||||
|
||||
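# Only the first num_channels_latents channels are denoised; the appended image latents and the
# ones channel stay fixed as conditioning throughout the loop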
latents[:, :, :, :, :num_channels_latents] = self.scheduler.step(
|
||||
pred_velocity[:, :], t, latents[:, :, :, :, :num_channels_latents], return_dict=False
|
||||
)[0]
|
||||
|
||||
if callback_on_step_end is not None:
|
||||
callback_kwargs = {}
|
||||
for k in callback_on_step_end_tensor_inputs:
|
||||
callback_kwargs[k] = locals()[k]
|
||||
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
|
||||
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds_qwen = callback_outputs.pop("prompt_embeds_qwen", prompt_embeds_qwen)
|
||||
prompt_embeds_clip = callback_outputs.pop("prompt_embeds_clip", prompt_embeds_clip)
|
||||
negative_prompt_embeds_qwen = callback_outputs.pop(
|
||||
"negative_prompt_embeds_qwen", negative_prompt_embeds_qwen
|
||||
)
|
||||
negative_prompt_embeds_clip = callback_outputs.pop(
|
||||
"negative_prompt_embeds_clip", negative_prompt_embeds_clip
|
||||
)
|
||||
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
progress_bar.update()
|
||||
|
||||
if XLA_AVAILABLE:
|
||||
xm.mark_step()
|
||||
|
||||
# 10. Post-processing - extract main latents
|
||||
latents = latents[:, :, :, :, :num_channels_latents]
|
||||
|
||||
# 11. Decode latents to image
|
||||
if output_type != "latent":
|
||||
latents = latents.to(self.vae.dtype)
|
||||
# Reshape and normalize latents
|
||||
latents = latents.reshape(
|
||||
batch_size,
|
||||
num_images_per_prompt,
|
||||
1,
|
||||
height // self.vae_scale_factor_spatial,
|
||||
width // self.vae_scale_factor_spatial,
|
||||
num_channels_latents,
|
||||
)
|
||||
latents = latents.permute(0, 1, 5, 2, 3, 4) # [batch, num_images, channels, 1, height, width]
|
||||
latents = latents.reshape(
|
||||
batch_size * num_images_per_prompt,
|
||||
num_channels_latents,
|
||||
height // self.vae_scale_factor_spatial,
|
||||
width // self.vae_scale_factor_spatial,
|
||||
)
|
||||
|
||||
# Normalize and decode through VAE
|
||||
latents = latents / self.vae.config.scaling_factor
|
||||
image = self.vae.decode(latents).sample
|
||||
image = self.image_processor.postprocess(image, output_type=output_type)
|
||||
else:
|
||||
image = latents
|
||||
|
||||
# Offload all models
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (image,)
|
||||
|
||||
return KandinskyImagePipelineOutput(image=image)
|
||||
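For orientation, a minimal usage sketch of the image-to-image pipeline above. It is not part of the diff: the checkpoint repo id and input URL are placeholders, while the class name `Kandinsky5I2IPipeline` matches the export registered later in this commit.

```python
# Hedged sketch, assuming an I2I Lite checkpoint (placeholder repo id below).
import torch
from diffusers import Kandinsky5I2IPipeline
from diffusers.utils import load_image

pipe = Kandinsky5I2IPipeline.from_pretrained(
    "kandinskylab/Kandinsky-5.0-I2I-Lite-sft-Diffusers",  # assumed/placeholder repo id
    torch_dtype=torch.bfloat16,
).to("cuda")

init_image = load_image("https://example.com/input.png")  # placeholder URL

# height/width default to the input image size and are snapped to a supported resolution
edited = pipe(
    image=init_image,
    prompt="Recolor the scene as a watercolor painting",
    num_inference_steps=50,
    guidance_scale=3.5,
).image[0]
edited.save("edited.png")
```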
1054  src/diffusers/pipelines/kandinsky5/pipeline_kandinsky_i2v.py  (new file; diff suppressed because it is too large)
818  src/diffusers/pipelines/kandinsky5/pipeline_kandinsky_t2i.py  (new file)
@@ -0,0 +1,818 @@
|
||||
# Copyright 2025 The Kandinsky Team and The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import html
|
||||
from typing import Callable, Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import regex as re
|
||||
import torch
|
||||
from torch.nn import functional as F
|
||||
from transformers import CLIPTextModel, CLIPTokenizer, Qwen2_5_VLForConditionalGeneration, Qwen2VLProcessor
|
||||
|
||||
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
|
||||
from ...image_processor import VaeImageProcessor
|
||||
from ...loaders import KandinskyLoraLoaderMixin
|
||||
from ...models import AutoencoderKL
|
||||
from ...models.transformers import Kandinsky5Transformer3DModel
|
||||
from ...schedulers import FlowMatchEulerDiscreteScheduler
|
||||
|
||||
# Add imports for offloading and tiling
|
||||
from ...utils import (
|
||||
is_ftfy_available,
|
||||
is_torch_xla_available,
|
||||
logging,
|
||||
replace_example_docstring,
|
||||
)
|
||||
from ...utils.torch_utils import randn_tensor
|
||||
from ..pipeline_utils import DiffusionPipeline
|
||||
from .pipeline_output import KandinskyImagePipelineOutput
|
||||
|
||||
|
||||
if is_torch_xla_available():
|
||||
import torch_xla.core.xla_model as xm
|
||||
|
||||
XLA_AVAILABLE = True
|
||||
else:
|
||||
XLA_AVAILABLE = False
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
if is_ftfy_available():
|
||||
import ftfy
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
EXAMPLE_DOC_STRING = """
|
||||
Examples:
|
||||
|
||||
```python
|
||||
>>> import torch
|
||||
>>> from diffusers import Kandinsky5T2IPipeline
|
||||
|
||||
>>> # Available models:
|
||||
>>> # kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers
|
||||
>>> # kandinskylab/Kandinsky-5.0-T2I-Lite-pretrain-Diffusers
|
||||
|
||||
>>> model_id = "kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers"
|
||||
>>> pipe = Kandinsky5T2IPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
|
||||
>>> pipe = pipe.to("cuda")
|
||||
|
||||
>>> prompt = "A cat and a dog baking a cake together in a kitchen."
|
||||
|
||||
>>> output = pipe(
|
||||
... prompt=prompt,
|
||||
... negative_prompt="",
|
||||
... height=1024,
|
||||
... width=1024,
|
||||
... num_inference_steps=50,
|
||||
... guidance_scale=3.5,
|
||||
... ).image[0]
|
||||
```
|
||||
"""
|
||||
|
||||
|
||||
def basic_clean(text):
|
||||
"""
|
||||
Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
|
||||
|
||||
Clean text using ftfy if available and unescape HTML entities.
|
||||
"""
|
||||
if is_ftfy_available():
|
||||
text = ftfy.fix_text(text)
|
||||
text = html.unescape(html.unescape(text))
|
||||
return text.strip()
|
||||
|
||||
|
||||
def whitespace_clean(text):
|
||||
"""
|
||||
Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
|
||||
|
||||
Normalize whitespace in text by replacing multiple spaces with single space.
|
||||
"""
|
||||
text = re.sub(r"\s+", " ", text)
|
||||
text = text.strip()
|
||||
return text
|
||||
|
||||
|
||||
def prompt_clean(text):
|
||||
"""
|
||||
Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/wan/pipeline_wan.py
|
||||
|
||||
Apply both basic cleaning and whitespace normalization to prompts.
|
||||
"""
|
||||
text = whitespace_clean(basic_clean(text))
|
||||
return text
|
||||
|
||||
|
||||
class Kandinsky5T2IPipeline(DiffusionPipeline, KandinskyLoraLoaderMixin):
|
||||
r"""
|
||||
Pipeline for text-to-image generation using Kandinsky 5.0.
|
||||
|
||||
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
|
||||
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
|
||||
|
||||
Args:
|
||||
transformer ([`Kandinsky5Transformer3DModel`]):
|
||||
Conditional Transformer to denoise the encoded image latents.
|
||||
vae ([`AutoencoderKL`]):
|
||||
Variational Auto-Encoder Model [black-forest-labs/FLUX.1-dev
|
||||
(vae)](https://huggingface.co/black-forest-labs/FLUX.1-dev) to encode and decode images to and from latent
|
||||
representations.
|
||||
text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
|
||||
Frozen text-encoder [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct).
|
||||
tokenizer ([`AutoProcessor`]):
|
||||
Tokenizer for Qwen2.5-VL.
|
||||
text_encoder_2 ([`CLIPTextModel`]):
|
||||
Frozen [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel),
|
||||
specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
|
||||
tokenizer_2 ([`CLIPTokenizer`]):
|
||||
Tokenizer for CLIP.
|
||||
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
|
||||
A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
|
||||
"""
|
||||
|
||||
model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
|
||||
_callback_tensor_inputs = [
|
||||
"latents",
|
||||
"prompt_embeds_qwen",
|
||||
"prompt_embeds_clip",
|
||||
"negative_prompt_embeds_qwen",
|
||||
"negative_prompt_embeds_clip",
|
||||
]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
transformer: Kandinsky5Transformer3DModel,
|
||||
vae: AutoencoderKL,
|
||||
text_encoder: Qwen2_5_VLForConditionalGeneration,
|
||||
tokenizer: Qwen2VLProcessor,
|
||||
text_encoder_2: CLIPTextModel,
|
||||
tokenizer_2: CLIPTokenizer,
|
||||
scheduler: FlowMatchEulerDiscreteScheduler,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.register_modules(
|
||||
transformer=transformer,
|
||||
vae=vae,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
text_encoder_2=text_encoder_2,
|
||||
tokenizer_2=tokenizer_2,
|
||||
scheduler=scheduler,
|
||||
)
|
||||
|
||||
self.prompt_template = "<|im_start|>system\nYou are a promt engineer. Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>"
|
||||
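# The chat template above occupies the first 41 tokens; embeddings and attention masks are sliced
# from this offset so only the user prompt tokens remain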
self.prompt_template_encode_start_idx = 41
|
||||
|
||||
self.vae_scale_factor_spatial = 8
|
||||
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
|
||||
self.resolutions = [(1024, 1024), (640, 1408), (1408, 640), (768, 1280), (1280, 768), (896, 1152), (1152, 896)]
|
||||
|
||||
def _encode_prompt_qwen(
|
||||
self,
|
||||
prompt: List[str],
|
||||
device: Optional[torch.device] = None,
|
||||
max_sequence_length: int = 512,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
):
|
||||
"""
|
||||
Encode prompt using Qwen2.5-VL text encoder.
|
||||
|
||||
This method processes the input prompt through the Qwen2.5-VL model to generate text embeddings suitable for
|
||||
image generation.
|
||||
|
||||
Args:
|
||||
prompt (List[str]): Input list of prompts
|
||||
device (torch.device): Device to run encoding on
|
||||
max_sequence_length (int): Maximum sequence length for tokenization
|
||||
dtype (torch.dtype): Data type for embeddings
|
||||
|
||||
Returns:
|
||||
Tuple[torch.Tensor, torch.Tensor]: Text embeddings and cumulative sequence lengths
|
||||
"""
|
||||
device = device or self._execution_device
|
||||
dtype = dtype or self.text_encoder.dtype
|
||||
|
||||
full_texts = [self.prompt_template.format(p) for p in prompt]
|
||||
max_allowed_len = self.prompt_template_encode_start_idx + max_sequence_length
|
||||
|
||||
untruncated_ids = self.tokenizer(
|
||||
text=full_texts,
|
||||
images=None,
|
||||
videos=None,
|
||||
return_tensors="pt",
|
||||
padding="longest",
|
||||
)["input_ids"]
|
||||
|
||||
if untruncated_ids.shape[-1] > max_allowed_len:
|
||||
for i, text in enumerate(full_texts):
|
||||
tokens = untruncated_ids[i][self.prompt_template_encode_start_idx : -2]
|
||||
removed_text = self.tokenizer.decode(tokens[max_sequence_length - 2 :])
|
||||
if len(removed_text) > 0:
|
||||
full_texts[i] = text[: -len(removed_text)]
|
||||
logger.warning(
|
||||
"The following part of your input was truncated because `max_sequence_length` is set to "
|
||||
f" {max_sequence_length} tokens: {removed_text}"
|
||||
)
|
||||
|
||||
inputs = self.tokenizer(
|
||||
text=full_texts,
|
||||
images=None,
|
||||
videos=None,
|
||||
max_length=max_allowed_len,
|
||||
truncation=True,
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
).to(device)
|
||||
|
||||
embeds = self.text_encoder(
|
||||
input_ids=inputs["input_ids"],
|
||||
return_dict=True,
|
||||
output_hidden_states=True,
|
||||
)["hidden_states"][-1][:, self.prompt_template_encode_start_idx :]
|
||||
attention_mask = inputs["attention_mask"][:, self.prompt_template_encode_start_idx :]
|
||||
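# Build cumulative sequence lengths over the batch, e.g. per-prompt token counts [7, 5] -> cu_seqlens [0, 7, 12]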
cu_seqlens = torch.cumsum(attention_mask.sum(1), dim=0)
|
||||
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0).to(dtype=torch.int32)
|
||||
|
||||
return embeds.to(dtype), cu_seqlens
|
||||
|
||||
def _encode_prompt_clip(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
device: Optional[torch.device] = None,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
):
|
||||
"""
|
||||
Encode prompt using CLIP text encoder.
|
||||
|
||||
This method processes the input prompt through the CLIP model to generate pooled embeddings that capture
|
||||
semantic information.
|
||||
|
||||
Args:
|
||||
prompt (Union[str, List[str]]): Input prompt or list of prompts
|
||||
device (torch.device): Device to run encoding on
|
||||
dtype (torch.dtype): Data type for embeddings
|
||||
|
||||
Returns:
|
||||
torch.Tensor: Pooled text embeddings from CLIP
|
||||
"""
|
||||
device = device or self._execution_device
|
||||
dtype = dtype or self.text_encoder_2.dtype
|
||||
|
||||
inputs = self.tokenizer_2(
|
||||
prompt,
|
||||
max_length=77,
|
||||
truncation=True,
|
||||
add_special_tokens=True,
|
||||
padding="max_length",
|
||||
return_tensors="pt",
|
||||
).to(device)
|
||||
|
||||
pooled_embed = self.text_encoder_2(**inputs)["pooler_output"]
|
||||
|
||||
return pooled_embed.to(dtype)
|
||||
|
||||
def encode_prompt(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
num_images_per_prompt: int = 1,
|
||||
max_sequence_length: int = 512,
|
||||
device: Optional[torch.device] = None,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
):
|
||||
r"""
|
||||
Encodes a single prompt (positive or negative) into text encoder hidden states.
|
||||
|
||||
This method combines embeddings from both Qwen2.5-VL and CLIP text encoders to create comprehensive text
|
||||
representations for image generation.
|
||||
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
Prompt to be encoded.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
Number of images to generate per prompt.
|
||||
max_sequence_length (`int`, *optional*, defaults to 512):
|
||||
Maximum sequence length for text encoding. Must not exceed 1024.
|
||||
device (`torch.device`, *optional*):
|
||||
Torch device.
|
||||
dtype (`torch.dtype`, *optional*):
|
||||
Torch dtype.
|
||||
|
||||
Returns:
|
||||
Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
||||
- Qwen text embeddings of shape (batch_size * num_images_per_prompt, sequence_length, embedding_dim)
|
||||
- CLIP pooled embeddings of shape (batch_size * num_images_per_prompt, clip_embedding_dim)
|
||||
- Cumulative sequence lengths (`cu_seqlens`) for Qwen embeddings of shape (batch_size *
|
||||
num_images_per_prompt + 1,)
|
||||
"""
|
||||
device = device or self._execution_device
|
||||
dtype = dtype or self.text_encoder.dtype
|
||||
|
||||
if not isinstance(prompt, list):
|
||||
prompt = [prompt]
|
||||
|
||||
batch_size = len(prompt)
|
||||
|
||||
prompt = [prompt_clean(p) for p in prompt]
|
||||
|
||||
# Encode with Qwen2.5-VL
|
||||
prompt_embeds_qwen, prompt_cu_seqlens = self._encode_prompt_qwen(
|
||||
prompt=prompt,
|
||||
device=device,
|
||||
max_sequence_length=max_sequence_length,
|
||||
dtype=dtype,
|
||||
)
|
||||
# prompt_embeds_qwen shape: [batch_size, seq_len, embed_dim]
|
||||
|
||||
# Encode with CLIP
|
||||
prompt_embeds_clip = self._encode_prompt_clip(
|
||||
prompt=prompt,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
# prompt_embeds_clip shape: [batch_size, clip_embed_dim]
|
||||
|
||||
# Repeat embeddings for num_images_per_prompt
|
||||
# Qwen embeddings: repeat sequence for each image, then reshape
|
||||
prompt_embeds_qwen = prompt_embeds_qwen.repeat(
|
||||
1, num_images_per_prompt, 1
|
||||
) # [batch_size, seq_len * num_images_per_prompt, embed_dim]
|
||||
# Reshape to [batch_size * num_images_per_prompt, seq_len, embed_dim]
|
||||
prompt_embeds_qwen = prompt_embeds_qwen.view(
|
||||
batch_size * num_images_per_prompt, -1, prompt_embeds_qwen.shape[-1]
|
||||
)
|
||||
|
||||
# CLIP embeddings: repeat for each image
|
||||
prompt_embeds_clip = prompt_embeds_clip.repeat(
|
||||
1, num_images_per_prompt, 1
|
||||
) # [batch_size, num_images_per_prompt, clip_embed_dim]
|
||||
# Reshape to [batch_size * num_images_per_prompt, clip_embed_dim]
|
||||
prompt_embeds_clip = prompt_embeds_clip.view(batch_size * num_images_per_prompt, -1)
|
||||
|
||||
# Repeat cumulative sequence lengths for num_images_per_prompt
|
||||
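# Example: cu_seqlens [0, 7, 12] with num_images_per_prompt=2 -> lengths [7, 5] -> [7, 7, 5, 5]
# -> cu_seqlens [0, 7, 14, 19, 24]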
# Original differences (lengths) for each prompt in the batch
|
||||
original_lengths = prompt_cu_seqlens.diff() # [len1, len2, ...]
|
||||
# Repeat the lengths for num_images_per_prompt
|
||||
repeated_lengths = original_lengths.repeat_interleave(
|
||||
num_images_per_prompt
|
||||
) # [len1, len1, ..., len2, len2, ...]
|
||||
# Reconstruct the cumulative lengths
|
||||
repeated_cu_seqlens = torch.cat(
|
||||
[torch.tensor([0], device=device, dtype=torch.int32), repeated_lengths.cumsum(0)]
|
||||
)
|
||||
|
||||
return prompt_embeds_qwen, prompt_embeds_clip, repeated_cu_seqlens
|
||||
|
||||
def check_inputs(
|
||||
self,
|
||||
prompt,
|
||||
negative_prompt,
|
||||
height,
|
||||
width,
|
||||
prompt_embeds_qwen=None,
|
||||
prompt_embeds_clip=None,
|
||||
negative_prompt_embeds_qwen=None,
|
||||
negative_prompt_embeds_clip=None,
|
||||
prompt_cu_seqlens=None,
|
||||
negative_prompt_cu_seqlens=None,
|
||||
callback_on_step_end_tensor_inputs=None,
|
||||
max_sequence_length=None,
|
||||
):
|
||||
"""
|
||||
Validate input parameters for the pipeline.
|
||||
|
||||
Args:
|
||||
prompt: Input prompt
|
||||
negative_prompt: Negative prompt for guidance
|
||||
height: Image height
|
||||
width: Image width
|
||||
prompt_embeds_qwen: Pre-computed Qwen prompt embeddings
|
||||
prompt_embeds_clip: Pre-computed CLIP prompt embeddings
|
||||
negative_prompt_embeds_qwen: Pre-computed Qwen negative prompt embeddings
|
||||
negative_prompt_embeds_clip: Pre-computed CLIP negative prompt embeddings
|
||||
prompt_cu_seqlens: Pre-computed cumulative sequence lengths for Qwen positive prompt
|
||||
negative_prompt_cu_seqlens: Pre-computed cumulative sequence lengths for Qwen negative prompt
|
||||
callback_on_step_end_tensor_inputs: Callback tensor inputs
|
||||
|
||||
Raises:
|
||||
ValueError: If inputs are invalid
|
||||
"""
|
||||
|
||||
if max_sequence_length is not None and max_sequence_length > 1024:
|
||||
raise ValueError("max_sequence_length must be less than 1024")
|
||||
|
||||
if (width, height) not in self.resolutions:
|
||||
resolutions_str = ",".join([f"({w},{h})" for w, h in self.resolutions])
|
||||
logger.warning(
|
||||
f"`height` and `width` have to be one of {resolutions_str}, but are {height} and {width}. Dimensions will be resized accordingly"
|
||||
)
|
||||
|
||||
if callback_on_step_end_tensor_inputs is not None and not all(
|
||||
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
|
||||
):
|
||||
raise ValueError(
|
||||
f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
|
||||
)
|
||||
|
||||
# Check for consistency within positive prompt embeddings and sequence lengths
|
||||
if prompt_embeds_qwen is not None or prompt_embeds_clip is not None or prompt_cu_seqlens is not None:
|
||||
if prompt_embeds_qwen is None or prompt_embeds_clip is None or prompt_cu_seqlens is None:
|
||||
raise ValueError(
|
||||
"If any of `prompt_embeds_qwen`, `prompt_embeds_clip`, or `prompt_cu_seqlens` is provided, "
|
||||
"all three must be provided."
|
||||
)
|
||||
|
||||
# Check for consistency within negative prompt embeddings and sequence lengths
|
||||
if (
|
||||
negative_prompt_embeds_qwen is not None
|
||||
or negative_prompt_embeds_clip is not None
|
||||
or negative_prompt_cu_seqlens is not None
|
||||
):
|
||||
if (
|
||||
negative_prompt_embeds_qwen is None
|
||||
or negative_prompt_embeds_clip is None
|
||||
or negative_prompt_cu_seqlens is None
|
||||
):
|
||||
raise ValueError(
|
||||
"If any of `negative_prompt_embeds_qwen`, `negative_prompt_embeds_clip`, or `negative_prompt_cu_seqlens` is provided, "
|
||||
"all three must be provided."
|
||||
)
|
||||
|
||||
# Check if prompt or embeddings are provided (either prompt or all required embedding components for positive)
|
||||
if prompt is None and prompt_embeds_qwen is None:
|
||||
raise ValueError(
|
||||
"Provide either `prompt` or `prompt_embeds_qwen` (and corresponding `prompt_embeds_clip` and `prompt_cu_seqlens`). Cannot leave all undefined."
|
||||
)
|
||||
|
||||
# Validate types for prompt and negative_prompt if provided
|
||||
if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
|
||||
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
|
||||
if negative_prompt is not None and (
|
||||
not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
|
||||
):
|
||||
raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
|
||||
|
||||
def prepare_latents(
|
||||
self,
|
||||
batch_size: int,
|
||||
num_channels_latents: int = 16,
|
||||
height: int = 1024,
|
||||
width: int = 1024,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
device: Optional[torch.device] = None,
|
||||
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
||||
latents: Optional[torch.Tensor] = None,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Prepare initial latent variables for text-to-image generation.
|
||||
|
||||
This method creates random noise latents.
|
||||
|
||||
Args:
|
||||
batch_size (int): Number of images to generate
|
||||
num_channels_latents (int): Number of channels in latent space
|
||||
height (int): Height of generated image
|
||||
width (int): Width of generated image
|
||||
dtype (torch.dtype): Data type for latents
|
||||
device (torch.device): Device to create latents on
|
||||
generator (torch.Generator): Random number generator
|
||||
latents (torch.Tensor): Pre-existing latents to use
|
||||
|
||||
Returns:
|
||||
torch.Tensor: Prepared latent tensor
|
||||
"""
|
||||
if latents is not None:
|
||||
return latents.to(device=device, dtype=dtype)
|
||||
|
||||
shape = (
|
||||
batch_size,
|
||||
1,
|
||||
int(height) // self.vae_scale_factor_spatial,
|
||||
int(width) // self.vae_scale_factor_spatial,
|
||||
num_channels_latents,
|
||||
)
|
||||
|
||||
if isinstance(generator, list) and len(generator) != batch_size:
|
||||
raise ValueError(
|
||||
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
||||
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
||||
)
|
||||
|
||||
# Generate random noise
|
||||
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
||||
return latents
|
||||
|
||||
@property
|
||||
def guidance_scale(self):
|
||||
"""Get the current guidance scale value."""
|
||||
return self._guidance_scale
|
||||
|
||||
@property
|
||||
def num_timesteps(self):
|
||||
"""Get the number of denoising timesteps."""
|
||||
return self._num_timesteps
|
||||
|
||||
@property
|
||||
def interrupt(self):
|
||||
"""Check if generation has been interrupted."""
|
||||
return self._interrupt
|
||||
|
||||
@torch.no_grad()
|
||||
@replace_example_docstring(EXAMPLE_DOC_STRING)
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]] = None,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
height: int = 1024,
|
||||
width: int = 1024,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 3.5,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
|
||||
latents: Optional[torch.Tensor] = None,
|
||||
prompt_embeds_qwen: Optional[torch.Tensor] = None,
|
||||
prompt_embeds_clip: Optional[torch.Tensor] = None,
|
||||
negative_prompt_embeds_qwen: Optional[torch.Tensor] = None,
|
||||
negative_prompt_embeds_clip: Optional[torch.Tensor] = None,
|
||||
prompt_cu_seqlens: Optional[torch.Tensor] = None,
|
||||
negative_prompt_cu_seqlens: Optional[torch.Tensor] = None,
|
||||
output_type: Optional[str] = "pil",
|
||||
return_dict: bool = True,
|
||||
callback_on_step_end: Optional[
|
||||
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
|
||||
] = None,
|
||||
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
|
||||
max_sequence_length: int = 512,
|
||||
):
|
||||
r"""
|
||||
The call function to the pipeline for text-to-image generation.
|
||||
|
||||
Args:
|
||||
prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to guide the image generation. If not defined, pass `prompt_embeds` instead.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts to avoid during image generation. If not defined, pass `negative_prompt_embeds`
|
||||
instead. Ignored when not using guidance (`guidance_scale` < `1`).
|
||||
height (`int`, defaults to `1024`):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, defaults to `1024`):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, defaults to `50`):
|
||||
The number of denoising steps.
|
||||
guidance_scale (`float`, defaults to `3.5`):
|
||||
Guidance scale as defined in classifier-free guidance.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
A torch generator to make generation deterministic.
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents.
|
||||
prompt_embeds_qwen (`torch.Tensor`, *optional*):
|
||||
Pre-generated Qwen text embeddings.
|
||||
prompt_embeds_clip (`torch.Tensor`, *optional*):
|
||||
Pre-generated CLIP text embeddings.
|
||||
negative_prompt_embeds_qwen (`torch.Tensor`, *optional*):
|
||||
Pre-generated Qwen negative text embeddings.
|
||||
negative_prompt_embeds_clip (`torch.Tensor`, *optional*):
|
||||
Pre-generated CLIP negative text embeddings.
|
||||
prompt_cu_seqlens (`torch.Tensor`, *optional*):
|
||||
Pre-generated cumulative sequence lengths for Qwen positive prompt.
|
||||
negative_prompt_cu_seqlens (`torch.Tensor`, *optional*):
|
||||
Pre-generated cumulative sequence lengths for Qwen negative prompt.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`KandinskyImagePipelineOutput`].
|
||||
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
|
||||
A function that is called at the end of each denoising step.
|
||||
callback_on_step_end_tensor_inputs (`List`, *optional*):
|
||||
The list of tensor inputs for the `callback_on_step_end` function.
|
||||
max_sequence_length (`int`, defaults to `512`):
|
||||
The maximum sequence length for text encoding.
|
||||
|
||||
Examples:
|
||||
|
||||
Returns:
|
||||
[`~KandinskyImagePipelineOutput`] or `tuple`:
|
||||
If `return_dict` is `True`, [`KandinskyImagePipelineOutput`] is returned, otherwise a `tuple` is
|
||||
returned where the first element is a list with the generated images.
|
||||
"""
|
||||
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
||||
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
||||
self.check_inputs(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
height=height,
|
||||
width=width,
|
||||
prompt_embeds_qwen=prompt_embeds_qwen,
|
||||
prompt_embeds_clip=prompt_embeds_clip,
|
||||
negative_prompt_embeds_qwen=negative_prompt_embeds_qwen,
|
||||
negative_prompt_embeds_clip=negative_prompt_embeds_clip,
|
||||
prompt_cu_seqlens=prompt_cu_seqlens,
|
||||
negative_prompt_cu_seqlens=negative_prompt_cu_seqlens,
|
||||
callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
|
||||
max_sequence_length=max_sequence_length,
|
||||
)
|
||||
if (width, height) not in self.resolutions:
|
||||
width, height = self.resolutions[
|
||||
np.argmin([abs((i[0] / i[1]) - (width / height)) for i in self.resolutions])
|
||||
]
|
||||
|
||||
self._guidance_scale = guidance_scale
|
||||
self._interrupt = False
|
||||
|
||||
device = self._execution_device
|
||||
dtype = self.transformer.dtype
|
||||
|
||||
# 2. Define call parameters
|
||||
if prompt is not None and isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
prompt = [prompt]
|
||||
elif prompt is not None and isinstance(prompt, list):
|
||||
batch_size = len(prompt)
|
||||
else:
|
||||
batch_size = prompt_embeds_qwen.shape[0]
|
||||
|
||||
# 3. Encode input prompt
|
||||
if prompt_embeds_qwen is None:
|
||||
prompt_embeds_qwen, prompt_embeds_clip, prompt_cu_seqlens = self.encode_prompt(
|
||||
prompt=prompt,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
max_sequence_length=max_sequence_length,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
|
||||
if self.guidance_scale > 1.0:
|
||||
if negative_prompt is None:
|
||||
negative_prompt = ""
|
||||
|
||||
if isinstance(negative_prompt, str):
|
||||
negative_prompt = [negative_prompt] * len(prompt) if prompt is not None else [negative_prompt]
|
||||
elif len(negative_prompt) != len(prompt):
|
||||
raise ValueError(
|
||||
f"`negative_prompt` must have same length as `prompt`. Got {len(negative_prompt)} vs {len(prompt)}."
|
||||
)
|
||||
|
||||
if negative_prompt_embeds_qwen is None:
|
||||
negative_prompt_embeds_qwen, negative_prompt_embeds_clip, negative_prompt_cu_seqlens = (
|
||||
self.encode_prompt(
|
||||
prompt=negative_prompt,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
max_sequence_length=max_sequence_length,
|
||||
device=device,
|
||||
dtype=dtype,
|
||||
)
|
||||
)
|
||||
|
||||
# 4. Prepare timesteps
|
||||
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
||||
timesteps = self.scheduler.timesteps
|
||||
|
||||
# 5. Prepare latent variables
|
||||
num_channels_latents = self.transformer.config.in_visual_dim
|
||||
latents = self.prepare_latents(
|
||||
batch_size=batch_size * num_images_per_prompt,
|
||||
num_channels_latents=num_channels_latents,
|
||||
height=height,
|
||||
width=width,
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
generator=generator,
|
||||
latents=latents,
|
||||
)
|
||||
|
||||
# 6. Prepare rope positions for positional encoding
|
||||
visual_rope_pos = [
|
||||
torch.arange(1, device=device),
|
||||
torch.arange(height // self.vae_scale_factor_spatial // 2, device=device),
|
||||
torch.arange(width // self.vae_scale_factor_spatial // 2, device=device),
|
||||
]
|
||||
|
||||
text_rope_pos = torch.arange(prompt_cu_seqlens.diff().max().item(), device=device)
|
||||
|
||||
negative_text_rope_pos = (
|
||||
torch.arange(negative_prompt_cu_seqlens.diff().max().item(), device=device)
|
||||
if negative_prompt_cu_seqlens is not None
|
||||
else None
|
||||
)
|
||||
|
||||
# 7. Scale factor (kept at 1.0; no dynamic resolution scaling for image generation)
|
||||
scale_factor = [1.0, 1.0, 1.0]
|
||||
|
||||
# 8. Sparse Params for efficient attention
|
||||
sparse_params = None
|
||||
|
||||
# 9. Denoising loop
|
||||
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
||||
self._num_timesteps = len(timesteps)
|
||||
with self.progress_bar(total=num_inference_steps) as progress_bar:
|
||||
for i, t in enumerate(timesteps):
|
||||
if self.interrupt:
|
||||
continue
|
||||
|
||||
timestep = t.unsqueeze(0).repeat(batch_size * num_images_per_prompt)
|
||||
|
||||
# Predict noise residual
|
||||
pred_velocity = self.transformer(
|
||||
hidden_states=latents.to(dtype),
|
||||
encoder_hidden_states=prompt_embeds_qwen.to(dtype),
|
||||
pooled_projections=prompt_embeds_clip.to(dtype),
|
||||
timestep=timestep.to(dtype),
|
||||
visual_rope_pos=visual_rope_pos,
|
||||
text_rope_pos=text_rope_pos,
|
||||
scale_factor=scale_factor,
|
||||
sparse_params=sparse_params,
|
||||
return_dict=True,
|
||||
).sample
|
||||
|
||||
if self.guidance_scale > 1.0 and negative_prompt_embeds_qwen is not None:
|
||||
uncond_pred_velocity = self.transformer(
|
||||
hidden_states=latents.to(dtype),
|
||||
encoder_hidden_states=negative_prompt_embeds_qwen.to(dtype),
|
||||
pooled_projections=negative_prompt_embeds_clip.to(dtype),
|
||||
timestep=timestep.to(dtype),
|
||||
visual_rope_pos=visual_rope_pos,
|
||||
text_rope_pos=negative_text_rope_pos,
|
||||
scale_factor=scale_factor,
|
||||
sparse_params=sparse_params,
|
||||
return_dict=True,
|
||||
).sample
|
||||
|
||||
pred_velocity = uncond_pred_velocity + guidance_scale * (pred_velocity - uncond_pred_velocity)
|
||||
|
||||
latents = self.scheduler.step(pred_velocity[:, :], t, latents, return_dict=False)[0]
|
||||
|
||||
if callback_on_step_end is not None:
|
||||
callback_kwargs = {}
|
||||
for k in callback_on_step_end_tensor_inputs:
|
||||
callback_kwargs[k] = locals()[k]
|
||||
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
|
||||
|
||||
latents = callback_outputs.pop("latents", latents)
|
||||
prompt_embeds_qwen = callback_outputs.pop("prompt_embeds_qwen", prompt_embeds_qwen)
|
||||
prompt_embeds_clip = callback_outputs.pop("prompt_embeds_clip", prompt_embeds_clip)
|
||||
negative_prompt_embeds_qwen = callback_outputs.pop(
|
||||
"negative_prompt_embeds_qwen", negative_prompt_embeds_qwen
|
||||
)
|
||||
negative_prompt_embeds_clip = callback_outputs.pop(
|
||||
"negative_prompt_embeds_clip", negative_prompt_embeds_clip
|
||||
)
|
||||
|
||||
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
|
||||
progress_bar.update()
|
||||
|
||||
if XLA_AVAILABLE:
|
||||
xm.mark_step()
|
||||
|
||||
# 10. Post-processing - extract main latents
|
||||
latents = latents[:, :, :, :, :num_channels_latents]
|
||||
|
||||
# 11. Decode latents to image
|
||||
if output_type != "latent":
|
||||
latents = latents.to(self.vae.dtype)
|
||||
# Reshape and normalize latents
|
||||
latents = latents.reshape(
|
||||
batch_size,
|
||||
num_images_per_prompt,
|
||||
1,
|
||||
height // self.vae_scale_factor_spatial,
|
||||
width // self.vae_scale_factor_spatial,
|
||||
num_channels_latents,
|
||||
)
|
||||
latents = latents.permute(0, 1, 5, 2, 3, 4) # [batch, num_images, channels, 1, height, width]
|
||||
latents = latents.reshape(
|
||||
batch_size * num_images_per_prompt,
|
||||
num_channels_latents,
|
||||
height // self.vae_scale_factor_spatial,
|
||||
width // self.vae_scale_factor_spatial,
|
||||
)
|
||||
|
||||
# Normalize and decode through VAE
|
||||
latents = latents / self.vae.config.scaling_factor
|
||||
image = self.vae.decode(latents).sample
|
||||
image = self.image_processor.postprocess(image, output_type=output_type)
|
||||
else:
|
||||
image = latents
|
||||
|
||||
# Offload all models
|
||||
self.maybe_free_model_hooks()
|
||||
|
||||
if not return_dict:
|
||||
return (image,)
|
||||
|
||||
return KandinskyImagePipelineOutput(image=image)
|
||||
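A hedged sketch of reusing precomputed embeddings with the call signature above. The checkpoint id is the one from the example docstring; everything else follows the parameters documented in `__call__` and is a sketch rather than a prescribed workflow.

```python
# Sketch: encode the prompt once and reuse the embeddings across several seeds.
# Assumes the T2I Lite checkpoint from the example docstring above.
import torch
from diffusers import Kandinsky5T2IPipeline

pipe = Kandinsky5T2IPipeline.from_pretrained(
    "kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers", torch_dtype=torch.bfloat16
).to("cuda")

embeds_qwen, embeds_clip, cu_seqlens = pipe.encode_prompt(
    prompt="A cat and a dog baking a cake together in a kitchen.",
    num_images_per_prompt=1,
    max_sequence_length=512,
)

for seed in (0, 1, 2):
    image = pipe(
        prompt_embeds_qwen=embeds_qwen,
        prompt_embeds_clip=embeds_clip,
        prompt_cu_seqlens=cu_seqlens,
        negative_prompt="",  # encoded on the fly because guidance_scale > 1
        generator=torch.Generator("cuda").manual_seed(seed),
        height=1024,
        width=1024,
        num_inference_steps=50,
        guidance_scale=3.5,
    ).image[0]
    image.save(f"cake_{seed}.png")
```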
@@ -8,7 +8,7 @@ from diffusers.utils import BaseOutput
|
||||
@dataclass
|
||||
class KandinskyPipelineOutput(BaseOutput):
|
||||
r"""
|
||||
Output class for Wan pipelines.
|
||||
Output class for Kandinsky video pipelines.
|
||||
|
||||
Args:
|
||||
frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
|
||||
@@ -18,3 +18,18 @@ class KandinskyPipelineOutput(BaseOutput):
|
||||
"""
|
||||
|
||||
frames: torch.Tensor
|
||||
|
||||
|
||||
@dataclass
|
||||
class KandinskyImagePipelineOutput(BaseOutput):
|
||||
r"""
|
||||
Output class for Kandinsky image pipelines.
|
||||
|
||||
Args:
|
||||
image (`torch.Tensor`, `np.ndarray`, or List[PIL.Image.Image]):
|
||||
List of image outputs. It can be a list of length `batch_size` containing denoised PIL images, or a
NumPy array or Torch tensor of shape `(batch_size, channels, height, width)`.
|
||||
"""
|
||||
|
||||
image: torch.Tensor
|
||||
|
||||
@@ -1367,6 +1367,51 @@ class Kandinsky3Pipeline(metaclass=DummyObject):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class Kandinsky5I2IPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class Kandinsky5I2VPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class Kandinsky5T2IPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class Kandinsky5T2VPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# Copyright 2025 The Kandinsky Team and The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2025 The Kandinsky Team and The HuggingFace Team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -16,12 +16,12 @@ import unittest
|
||||
|
||||
import torch
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
CLIPTextConfig,
|
||||
CLIPTextModel,
|
||||
CLIPTokenizer,
|
||||
Qwen2_5_VLConfig,
|
||||
Qwen2_5_VLForConditionalGeneration,
|
||||
Qwen2VLProcessor,
|
||||
)
|
||||
|
||||
from diffusers import (
|
||||
@@ -33,9 +33,7 @@ from diffusers import (
|
||||
|
||||
from ...testing_utils import (
|
||||
enable_full_determinism,
|
||||
torch_device,
|
||||
)
|
||||
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
|
||||
from ..test_pipelines_common import PipelineTesterMixin
|
||||
|
||||
|
||||
@@ -44,51 +42,62 @@ enable_full_determinism()
|
||||
|
||||
class Kandinsky5T2VPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
pipeline_class = Kandinsky5T2VPipeline
|
||||
params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs", "prompt_embeds", "negative_prompt_embeds"}
|
||||
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
|
||||
|
||||
# Define required optional parameters for your pipeline
|
||||
required_optional_params = frozenset(
|
||||
[
|
||||
"num_inference_steps",
|
||||
"generator",
|
||||
"latents",
|
||||
"return_dict",
|
||||
"callback_on_step_end",
|
||||
"callback_on_step_end_tensor_inputs",
|
||||
"max_sequence_length",
|
||||
]
|
||||
)
|
||||
batch_params = ["prompt", "negative_prompt"]
|
||||
|
||||
params = frozenset(["prompt", "height", "width", "num_frames", "num_inference_steps", "guidance_scale"])
|
||||
|
||||
required_optional_params = {
|
||||
"num_inference_steps",
|
||||
"generator",
|
||||
"latents",
|
||||
"return_dict",
|
||||
"callback_on_step_end",
|
||||
"callback_on_step_end_tensor_inputs",
|
||||
"max_sequence_length",
|
||||
}
|
||||
test_xformers_attention = False
|
||||
supports_optional_components = True
|
||||
supports_dduf = False
|
||||
test_attention_slicing = False
|
||||
|
||||
def get_dummy_components(self):
|
||||
torch.manual_seed(0)
|
||||
vae = AutoencoderKLHunyuanVideo(
|
||||
act_fn="silu",
|
||||
block_out_channels=[32, 64],
|
||||
down_block_types=[
|
||||
"HunyuanVideoDownBlock3D",
|
||||
"HunyuanVideoDownBlock3D",
|
||||
],
|
||||
in_channels=3,
|
||||
latent_channels=16,
|
||||
layers_per_block=1,
|
||||
mid_block_add_attention=False,
|
||||
norm_num_groups=32,
|
||||
out_channels=3,
|
||||
scaling_factor=0.476986,
|
||||
spatial_compression_ratio=8,
|
||||
temporal_compression_ratio=4,
|
||||
latent_channels=4,
|
||||
block_out_channels=(8, 8, 8, 8),
|
||||
layers_per_block=1,
|
||||
norm_num_groups=4,
|
||||
up_block_types=[
|
||||
"HunyuanVideoUpBlock3D",
|
||||
"HunyuanVideoUpBlock3D",
|
||||
],
|
||||
)
|
||||
|
||||
torch.manual_seed(0)
|
||||
scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
|
||||
|
||||
# Dummy Qwen2.5-VL model
|
||||
config = Qwen2_5_VLConfig(
|
||||
qwen_hidden_size = 32
|
||||
torch.manual_seed(0)
|
||||
qwen_config = Qwen2_5_VLConfig(
|
||||
text_config={
|
||||
"hidden_size": 16,
|
||||
"intermediate_size": 16,
|
||||
"hidden_size": qwen_hidden_size,
|
||||
"intermediate_size": qwen_hidden_size,
|
||||
"num_hidden_layers": 2,
|
||||
"num_attention_heads": 2,
|
||||
"num_key_value_heads": 2,
|
||||
"rope_scaling": {
|
||||
"mrope_section": [1, 1, 2],
|
||||
"mrope_section": [2, 2, 4],
|
||||
"rope_type": "default",
|
||||
"type": "default",
|
||||
},
|
||||
@@ -96,211 +105,106 @@ class Kandinsky5T2VPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
            },
            vision_config={
                "depth": 2,
                "hidden_size": qwen_hidden_size,
                "intermediate_size": qwen_hidden_size,
                "num_heads": 2,
                "out_hidden_size": qwen_hidden_size,
            },
            hidden_size=qwen_hidden_size,
            vocab_size=152064,
            vision_end_token_id=151653,
            vision_start_token_id=151652,
            vision_token_id=151654,
        )
        text_encoder = Qwen2_5_VLForConditionalGeneration(qwen_config)
        tokenizer = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")

        clip_hidden_size = 16
        torch.manual_seed(0)
        clip_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=clip_hidden_size,
            intermediate_size=16,
            layer_norm_eps=1e-05,
            num_attention_heads=2,
            num_hidden_layers=2,
            pad_token_id=1,
            vocab_size=1000,
            projection_dim=clip_hidden_size,
        )
        text_encoder_2 = CLIPTextModel(clip_config)
        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        torch.manual_seed(0)
        transformer = Kandinsky5Transformer3DModel(
            in_visual_dim=16,
            in_text_dim=qwen_hidden_size,
            in_text_dim2=clip_hidden_size,
            time_dim=16,
            out_visual_dim=16,
            patch_size=(1, 2, 2),
            model_dim=16,
            ff_dim=32,
            num_text_blocks=1,
            num_visual_blocks=2,
            axes_dims=(1, 1, 2),
            visual_cond=False,
            attention_type="regular",
        )

        return {
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "text_encoder_2": text_encoder_2,
            "tokenizer_2": tokenizer_2,
            "transformer": transformer,
            "scheduler": scheduler,
        }

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        return {
            "prompt": "a red square",
            "height": 32,
            "width": 32,
            "num_frames": 5,
            "num_inference_steps": 2,
            "guidance_scale": 4.0,
            "generator": generator,
            "output_type": "pt",
            "max_sequence_length": 8,
        }

    def test_inference(self):
        device = "cpu"
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = pipe(**inputs)
        video = output.frames[0]

        self.assertEqual(video.shape, (3, 3, 16, 16))

    @unittest.skip("Only SDPA or NABLA (flex)")
    def test_xformers_memory_efficient_attention(self):
        pass

    @unittest.skip("TODO: Test does not work")
    def test_encode_prompt_works_in_isolation(self):
        pass

    @unittest.skip("TODO: revisit")
    def test_inference_batch_single_identical(self):
        pass
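All of these fast tests build their scheduler as FlowMatchEulerDiscreteScheduler(shift=7.0). As a side note (not part of the diff), here is a minimal sketch of how one could inspect what that shift value does to the flow-matching sigma schedule, assuming the standard set_timesteps()/sigmas API of diffusers schedulers:

import torch

from diffusers import FlowMatchEulerDiscreteScheduler

# Compare the sigma schedules produced without and with the shift used in the tests.
for shift in (1.0, 7.0):
    scheduler = FlowMatchEulerDiscreteScheduler(shift=shift)
    scheduler.set_timesteps(num_inference_steps=8)
    # A larger shift pushes the sigmas toward the high-noise end of the schedule.
    print(f"shift={shift}: sigmas={scheduler.sigmas.tolist()}")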
213  tests/pipelines/kandinsky5/test_kandinsky5_i2i.py  Normal file
@@ -0,0 +1,213 @@
# Copyright 2025 The Kandinsky Team and The HuggingFace Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    CLIPTextConfig,
    CLIPTextModel,
    CLIPTokenizer,
    Qwen2_5_VLConfig,
    Qwen2_5_VLForConditionalGeneration,
)

from diffusers import (
    AutoencoderKL,
    FlowMatchEulerDiscreteScheduler,
    Kandinsky5I2IPipeline,
    Kandinsky5Transformer3DModel,
)
from diffusers.utils.testing_utils import enable_full_determinism

from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class Kandinsky5I2IPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = Kandinsky5I2IPipeline

    batch_params = ["prompt", "negative_prompt"]
    params = frozenset(["image", "prompt", "height", "width", "num_inference_steps", "guidance_scale"])

    required_optional_params = {
        "num_inference_steps",
        "generator",
        "latents",
        "return_dict",
        "callback_on_step_end",
        "callback_on_step_end_tensor_inputs",
        "max_sequence_length",
    }
    test_xformers_attention = False
    supports_optional_components = True
    supports_dduf = False
    test_attention_slicing = False

    def get_dummy_components(self):
        torch.manual_seed(0)
        vae = AutoencoderKL(
            act_fn="silu",
            block_out_channels=[32, 64, 64, 64],
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"],
            force_upcast=True,
            in_channels=3,
            latent_channels=16,
            layers_per_block=1,
            mid_block_add_attention=False,
            norm_num_groups=32,
            out_channels=3,
            sample_size=64,
            scaling_factor=0.3611,
            shift_factor=0.1159,
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"],
            use_post_quant_conv=False,
            use_quant_conv=False,
        )

        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)

        qwen_hidden_size = 32
        torch.manual_seed(0)
        qwen_config = Qwen2_5_VLConfig(
            text_config={
                "hidden_size": qwen_hidden_size,
                "intermediate_size": qwen_hidden_size,
                "num_hidden_layers": 2,
                "num_attention_heads": 2,
                "num_key_value_heads": 2,
                "rope_scaling": {
                    "mrope_section": [2, 2, 4],
                    "rope_type": "default",
                    "type": "default",
                },
                "rope_theta": 1000000.0,
            },
            vision_config={
                "depth": 2,
                "hidden_size": qwen_hidden_size,
                "intermediate_size": qwen_hidden_size,
                "num_heads": 2,
                "out_hidden_size": qwen_hidden_size,
            },
            hidden_size=qwen_hidden_size,
            vocab_size=152064,
            vision_end_token_id=151653,
            vision_start_token_id=151652,
            vision_token_id=151654,
        )
        text_encoder = Qwen2_5_VLForConditionalGeneration(qwen_config)
        tokenizer = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")

        clip_hidden_size = 16
        torch.manual_seed(0)
        clip_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=clip_hidden_size,
            intermediate_size=16,
            layer_norm_eps=1e-05,
            num_attention_heads=2,
            num_hidden_layers=2,
            pad_token_id=1,
            vocab_size=1000,
            projection_dim=clip_hidden_size,
        )
        text_encoder_2 = CLIPTextModel(clip_config)
        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        torch.manual_seed(0)
        transformer = Kandinsky5Transformer3DModel(
            in_visual_dim=16,
            in_text_dim=qwen_hidden_size,
            in_text_dim2=clip_hidden_size,
            time_dim=16,
            out_visual_dim=16,
            patch_size=(1, 2, 2),
            model_dim=16,
            ff_dim=32,
            num_text_blocks=1,
            num_visual_blocks=2,
            axes_dims=(1, 1, 2),
            visual_cond=True,
            attention_type="regular",
        )

        return {
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "text_encoder_2": text_encoder_2,
            "tokenizer_2": tokenizer_2,
            "transformer": transformer,
            "scheduler": scheduler,
        }

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        image = Image.new("RGB", (64, 64), color="red")

        return {
            "image": image,
            "prompt": "a red square",
            "height": 64,
            "width": 64,
            "num_inference_steps": 2,
            "guidance_scale": 4.0,
            "generator": generator,
            "output_type": "pt",
            "max_sequence_length": 8,
        }

    def test_inference(self):
        device = "cpu"
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.resolutions = [(64, 64)]
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = pipe(**inputs)
        image = output.image

        self.assertEqual(image.shape, (1, 3, 64, 64))

    @unittest.skip("TODO: Test does not work")
    def test_encode_prompt_works_in_isolation(self):
        pass

    @unittest.skip("TODO: revisit, batch is not yet supported in this pipeline")
    def test_num_images_per_prompt(self):
        pass

    @unittest.skip("TODO: revisit, batch is not yet supported in this pipeline")
    def test_inference_batch_single_identical(self):
        pass

    @unittest.skip("TODO: revisit, batch is not yet supported in this pipeline")
    def test_inference_batch_consistent(self):
        pass

    @unittest.skip("TODO: revisit, not working")
    def test_float16_inference(self):
        pass
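A quick sanity check on the spatial bookkeeping behind the I2I dummies above (illustrative only, not part of the commit): an AutoencoderKL with four blocks downsamples by 2**3 = 8, so the 64×64 dummy image becomes an 8×8, 16-channel latent before the transformer's (1, 2, 2) patching turns it into a 4×4 token grid. A standalone sketch under those assumptions:

import torch

from diffusers import AutoencoderKL

# Tiny AutoencoderKL mirroring the dummy config above: 4 blocks -> 2**3 = 8x spatial downsampling.
vae = AutoencoderKL(
    block_out_channels=[32, 64, 64, 64],
    down_block_types=["DownEncoderBlock2D"] * 4,
    up_block_types=["UpDecoderBlock2D"] * 4,
    in_channels=3,
    out_channels=3,
    latent_channels=16,
    layers_per_block=1,
    norm_num_groups=32,
    use_quant_conv=False,
    use_post_quant_conv=False,
)

with torch.no_grad():
    posterior = vae.encode(torch.zeros(1, 3, 64, 64)).latent_dist
    latents = posterior.sample()

print(latents.shape)  # expected: torch.Size([1, 16, 8, 8])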
211  tests/pipelines/kandinsky5/test_kandinsky5_i2v.py  Normal file
@@ -0,0 +1,211 @@
# Copyright 2025 The Kandinsky Team and The HuggingFace Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import torch
from PIL import Image
from transformers import (
    AutoProcessor,
    CLIPTextConfig,
    CLIPTextModel,
    CLIPTokenizer,
    Qwen2_5_VLConfig,
    Qwen2_5_VLForConditionalGeneration,
)

from diffusers import (
    AutoencoderKLHunyuanVideo,
    FlowMatchEulerDiscreteScheduler,
    Kandinsky5I2VPipeline,
    Kandinsky5Transformer3DModel,
)
from diffusers.utils.testing_utils import enable_full_determinism

from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class Kandinsky5I2VPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = Kandinsky5I2VPipeline

    batch_params = ["prompt", "negative_prompt"]
    params = frozenset(["image", "prompt", "height", "width", "num_frames", "num_inference_steps", "guidance_scale"])

    required_optional_params = {
        "num_inference_steps",
        "generator",
        "latents",
        "return_dict",
        "callback_on_step_end",
        "callback_on_step_end_tensor_inputs",
        "max_sequence_length",
    }
    test_xformers_attention = False
    supports_optional_components = True
    supports_dduf = False
    test_attention_slicing = False

    def get_dummy_components(self):
        torch.manual_seed(0)
        vae = AutoencoderKLHunyuanVideo(
            act_fn="silu",
            block_out_channels=[32, 64, 64],
            down_block_types=[
                "HunyuanVideoDownBlock3D",
                "HunyuanVideoDownBlock3D",
                "HunyuanVideoDownBlock3D",
            ],
            in_channels=3,
            latent_channels=16,
            layers_per_block=1,
            mid_block_add_attention=False,
            norm_num_groups=32,
            out_channels=3,
            scaling_factor=0.476986,
            spatial_compression_ratio=8,
            temporal_compression_ratio=4,
            up_block_types=[
                "HunyuanVideoUpBlock3D",
                "HunyuanVideoUpBlock3D",
                "HunyuanVideoUpBlock3D",
            ],
        )

        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)

        qwen_hidden_size = 32
        torch.manual_seed(0)
        qwen_config = Qwen2_5_VLConfig(
            text_config={
                "hidden_size": qwen_hidden_size,
                "intermediate_size": qwen_hidden_size,
                "num_hidden_layers": 2,
                "num_attention_heads": 2,
                "num_key_value_heads": 2,
                "rope_scaling": {
                    "mrope_section": [2, 2, 4],
                    "rope_type": "default",
                    "type": "default",
                },
                "rope_theta": 1000000.0,
            },
            vision_config={
                "depth": 2,
                "hidden_size": qwen_hidden_size,
                "intermediate_size": qwen_hidden_size,
                "num_heads": 2,
                "out_hidden_size": qwen_hidden_size,
            },
            hidden_size=qwen_hidden_size,
            vocab_size=152064,
            vision_end_token_id=151653,
            vision_start_token_id=151652,
            vision_token_id=151654,
        )
        text_encoder = Qwen2_5_VLForConditionalGeneration(qwen_config)
        tokenizer = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")

        clip_hidden_size = 16
        torch.manual_seed(0)
        clip_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=clip_hidden_size,
            intermediate_size=16,
            layer_norm_eps=1e-05,
            num_attention_heads=2,
            num_hidden_layers=2,
            pad_token_id=1,
            vocab_size=1000,
            projection_dim=clip_hidden_size,
        )
        text_encoder_2 = CLIPTextModel(clip_config)
        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        torch.manual_seed(0)
        transformer = Kandinsky5Transformer3DModel(
            in_visual_dim=16,
            in_text_dim=qwen_hidden_size,
            in_text_dim2=clip_hidden_size,
            time_dim=16,
            out_visual_dim=16,
            patch_size=(1, 2, 2),
            model_dim=16,
            ff_dim=32,
            num_text_blocks=1,
            num_visual_blocks=2,
            axes_dims=(1, 1, 2),
            visual_cond=True,
            attention_type="regular",
        )

        return {
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "text_encoder_2": text_encoder_2,
            "tokenizer_2": tokenizer_2,
            "transformer": transformer,
            "scheduler": scheduler,
        }

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        image = Image.new("RGB", (32, 32), color="red")

        return {
            "image": image,
            "prompt": "a red square",
            "height": 32,
            "width": 32,
            "num_frames": 17,
            "num_inference_steps": 2,
            "guidance_scale": 4.0,
            "generator": generator,
            "output_type": "pt",
            "max_sequence_length": 8,
        }

    def test_inference(self):
        device = "cpu"
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = pipe(**inputs)
        video = output.frames[0]

        # 17 frames, RGB, 32×32
        self.assertEqual(video.shape, (17, 3, 32, 32))

    @unittest.skip("TODO: Test does not work")
    def test_encode_prompt_works_in_isolation(self):
        pass

    @unittest.skip("TODO: revisit")
    def test_callback_inputs(self):
        pass

    @unittest.skip("TODO: revisit")
    def test_inference_batch_single_identical(self):
        pass
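For the I2V dummies above, the shapes exercised by test_inference line up with some simple bookkeeping, assuming the usual causal video-VAE convention in which the first frame is kept and the remaining frames are compressed by temporal_compression_ratio (illustrative arithmetic only, not part of the commit):

# Assumed convention: latent_frames = 1 + (num_frames - 1) // temporal_compression_ratio.
num_frames = 17
temporal_compression_ratio = 4
spatial_compression_ratio = 8
patch_size = (1, 2, 2)
height = width = 32

latent_frames = 1 + (num_frames - 1) // temporal_compression_ratio            # 5
latent_h = height // spatial_compression_ratio                                # 4
latent_w = width // spatial_compression_ratio                                 # 4
tokens_per_frame = (latent_h // patch_size[1]) * (latent_w // patch_size[2])  # 2 * 2 = 4

print(latent_frames, latent_h, latent_w, tokens_per_frame)  # 5 4 4 4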
207  tests/pipelines/kandinsky5/test_kandinsky5_t2i.py  Normal file
@@ -0,0 +1,207 @@
# Copyright 2025 The Kandinsky Team and The HuggingFace Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import torch
from transformers import (
    AutoProcessor,
    CLIPTextConfig,
    CLIPTextModel,
    CLIPTokenizer,
    Qwen2_5_VLConfig,
    Qwen2_5_VLForConditionalGeneration,
)

from diffusers import (
    AutoencoderKL,
    FlowMatchEulerDiscreteScheduler,
    Kandinsky5T2IPipeline,
    Kandinsky5Transformer3DModel,
)
from diffusers.utils.testing_utils import enable_full_determinism

from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class Kandinsky5T2IPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = Kandinsky5T2IPipeline

    batch_params = ["prompt", "negative_prompt"]
    params = frozenset(["prompt", "height", "width", "num_inference_steps", "guidance_scale"])

    required_optional_params = {
        "num_inference_steps",
        "generator",
        "latents",
        "return_dict",
        "callback_on_step_end",
        "callback_on_step_end_tensor_inputs",
        "max_sequence_length",
    }
    test_xformers_attention = False
    supports_optional_components = True
    supports_dduf = False
    test_attention_slicing = False

    def get_dummy_components(self):
        torch.manual_seed(0)
        vae = AutoencoderKL(
            act_fn="silu",
            block_out_channels=[32, 64],
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            force_upcast=True,
            in_channels=3,
            latent_channels=16,
            layers_per_block=1,
            mid_block_add_attention=False,
            norm_num_groups=32,
            out_channels=3,
            sample_size=128,
            scaling_factor=0.3611,
            shift_factor=0.1159,
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            use_post_quant_conv=False,
            use_quant_conv=False,
        )

        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)

        qwen_hidden_size = 32
        torch.manual_seed(0)
        qwen_config = Qwen2_5_VLConfig(
            text_config={
                "hidden_size": qwen_hidden_size,
                "intermediate_size": qwen_hidden_size,
                "num_hidden_layers": 2,
                "num_attention_heads": 2,
                "num_key_value_heads": 2,
                "rope_scaling": {
                    "mrope_section": [2, 2, 4],
                    "rope_type": "default",
                    "type": "default",
                },
                "rope_theta": 1000000.0,
            },
            vision_config={
                "depth": 2,
                "hidden_size": qwen_hidden_size,
                "intermediate_size": qwen_hidden_size,
                "num_heads": 2,
                "out_hidden_size": qwen_hidden_size,
            },
            hidden_size=qwen_hidden_size,
            vocab_size=152064,
            vision_end_token_id=151653,
            vision_start_token_id=151652,
            vision_token_id=151654,
        )
        text_encoder = Qwen2_5_VLForConditionalGeneration(qwen_config)
        tokenizer = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration")

        clip_hidden_size = 16
        torch.manual_seed(0)
        clip_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=clip_hidden_size,
            intermediate_size=16,
            layer_norm_eps=1e-05,
            num_attention_heads=2,
            num_hidden_layers=2,
            pad_token_id=1,
            vocab_size=1000,
            projection_dim=clip_hidden_size,
        )
        text_encoder_2 = CLIPTextModel(clip_config)
        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        torch.manual_seed(0)
        transformer = Kandinsky5Transformer3DModel(
            in_visual_dim=16,
            in_text_dim=qwen_hidden_size,
            in_text_dim2=clip_hidden_size,
            time_dim=16,
            out_visual_dim=16,
            patch_size=(1, 2, 2),
            model_dim=16,
            ff_dim=32,
            num_text_blocks=1,
            num_visual_blocks=2,
            axes_dims=(1, 1, 2),
            visual_cond=False,
            attention_type="regular",
        )

        return {
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "text_encoder_2": text_encoder_2,
            "tokenizer_2": tokenizer_2,
            "transformer": transformer,
            "scheduler": scheduler,
        }

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        return {
            "prompt": "a red square",
            "height": 64,
            "width": 64,
            "num_inference_steps": 2,
            "guidance_scale": 4.0,
            "generator": generator,
            "output_type": "pt",
            "max_sequence_length": 8,
        }

    def test_inference(self):
        device = "cpu"
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.resolutions = [(64, 64)]
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = pipe(**inputs)
        image = output.image

        self.assertEqual(image.shape, (1, 3, 16, 16))

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=5e-3)

    @unittest.skip("Test not supported")
    def test_attention_slicing_forward_pass(self):
        pass

    @unittest.skip("Only SDPA or NABLA (flex)")
    def test_xformers_memory_efficient_attention(self):
        pass

    @unittest.skip("All encoders are needed")
    def test_encode_prompt_works_in_isolation(self):
        pass

    @unittest.skip("Meant for either FP32 or BF16 inference")
    def test_float16_inference(self):
        pass
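To run the new fast-test modules locally, a small helper along the following lines should work from the repository root (a hypothetical script, not part of the commit; plain `pytest <path>` does the same):

# run_kandinsky5_fast_tests.py -- hypothetical helper, not part of the commit.
import sys

import pytest

if __name__ == "__main__":
    sys.exit(
        pytest.main(
            [
                "-q",
                "tests/pipelines/kandinsky5/test_kandinsky5_t2i.py",
                "tests/pipelines/kandinsky5/test_kandinsky5_i2i.py",
                "tests/pipelines/kandinsky5/test_kandinsky5_i2v.py",
            ]
        )
    )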