1
0
mirror of https://github.com/huggingface/diffusers.git synced 2026-01-27 17:22:53 +03:00

[tests] remove tests for deprecated pipelines. (#11879)

* remove tests for deprecated pipelines.

* remove folders

* test_pipelines_common
This commit is contained in:
Sayak Paul
2025-07-08 07:13:16 +05:30
committed by GitHub
parent 15d50f16f2
commit bc55b631fd
52 changed files with 0 additions and 10613 deletions

View File

@@ -1,171 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
from diffusers import AmusedPipeline, AmusedScheduler, UVit2DModel, VQModel
from diffusers.utils.testing_utils import (
enable_full_determinism,
require_torch_accelerator,
slow,
torch_device,
)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()
class AmusedPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = AmusedPipeline
params = TEXT_TO_IMAGE_PARAMS | {"encoder_hidden_states", "negative_encoder_hidden_states"}
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
test_layerwise_casting = True
test_group_offloading = True
def get_dummy_components(self):
torch.manual_seed(0)
transformer = UVit2DModel(
hidden_size=8,
use_bias=False,
hidden_dropout=0.0,
cond_embed_dim=8,
micro_cond_encode_dim=2,
micro_cond_embed_dim=10,
encoder_hidden_size=8,
vocab_size=32,
codebook_size=8,
in_channels=8,
block_out_channels=8,
num_res_blocks=1,
downsample=True,
upsample=True,
block_num_heads=1,
num_hidden_layers=1,
num_attention_heads=1,
attention_dropout=0.0,
intermediate_size=8,
layer_norm_eps=1e-06,
ln_elementwise_affine=True,
)
scheduler = AmusedScheduler(mask_token_id=31)
torch.manual_seed(0)
vqvae = VQModel(
act_fn="silu",
block_out_channels=[8],
down_block_types=["DownEncoderBlock2D"],
in_channels=3,
latent_channels=8,
layers_per_block=1,
norm_num_groups=8,
num_vq_embeddings=8,
out_channels=3,
sample_size=8,
up_block_types=["UpDecoderBlock2D"],
mid_block_add_attention=False,
lookup_from_codebook=True,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=8,
intermediate_size=8,
layer_norm_eps=1e-05,
num_attention_heads=1,
num_hidden_layers=1,
pad_token_id=1,
vocab_size=1000,
projection_dim=8,
)
text_encoder = CLIPTextModelWithProjection(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"transformer": transformer,
"scheduler": scheduler,
"vqvae": vqvae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "A painting of a squirrel eating a burger",
"generator": generator,
"num_inference_steps": 2,
"output_type": "np",
"height": 4,
"width": 4,
}
return inputs
def test_inference_batch_consistent(self, batch_sizes=[2]):
self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False)
@unittest.skip("aMUSEd does not support lists of generators")
def test_inference_batch_single_identical(self): ...
@slow
@require_torch_accelerator
class AmusedPipelineSlowTests(unittest.TestCase):
def test_amused_256(self):
pipe = AmusedPipeline.from_pretrained("amused/amused-256")
pipe.to(torch_device)
image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 256, 256, 3)
expected_slice = np.array([0.4011, 0.3992, 0.379, 0.3856, 0.3772, 0.3711, 0.3919, 0.385, 0.3625])
assert np.abs(image_slice - expected_slice).max() < 0.003
def test_amused_256_fp16(self):
pipe = AmusedPipeline.from_pretrained("amused/amused-256", variant="fp16", torch_dtype=torch.float16)
pipe.to(torch_device)
image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 256, 256, 3)
expected_slice = np.array([0.0554, 0.05129, 0.0344, 0.0452, 0.0476, 0.0271, 0.0495, 0.0527, 0.0158])
assert np.abs(image_slice - expected_slice).max() < 0.007
def test_amused_512(self):
pipe = AmusedPipeline.from_pretrained("amused/amused-512")
pipe.to(torch_device)
image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.1199, 0.1171, 0.1229, 0.1188, 0.1210, 0.1147, 0.1260, 0.1346, 0.1152])
assert np.abs(image_slice - expected_slice).max() < 0.003
def test_amused_512_fp16(self):
pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
pipe.to(torch_device)
image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.1509, 0.1492, 0.1531, 0.1485, 0.1501, 0.1465, 0.1581, 0.1690, 0.1499])
assert np.abs(image_slice - expected_slice).max() < 0.003

View File

@@ -1,215 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
from diffusers import AmusedImg2ImgPipeline, AmusedScheduler, UVit2DModel, VQModel
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
enable_full_determinism,
require_torch_accelerator,
slow,
torch_device,
)
from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()
class AmusedImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = AmusedImg2ImgPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "latents"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
def get_dummy_components(self):
torch.manual_seed(0)
transformer = UVit2DModel(
hidden_size=8,
use_bias=False,
hidden_dropout=0.0,
cond_embed_dim=8,
micro_cond_encode_dim=2,
micro_cond_embed_dim=10,
encoder_hidden_size=8,
vocab_size=32,
codebook_size=8,
in_channels=8,
block_out_channels=8,
num_res_blocks=1,
downsample=True,
upsample=True,
block_num_heads=1,
num_hidden_layers=1,
num_attention_heads=1,
attention_dropout=0.0,
intermediate_size=8,
layer_norm_eps=1e-06,
ln_elementwise_affine=True,
)
scheduler = AmusedScheduler(mask_token_id=31)
torch.manual_seed(0)
vqvae = VQModel(
act_fn="silu",
block_out_channels=[8],
down_block_types=["DownEncoderBlock2D"],
in_channels=3,
latent_channels=8,
layers_per_block=1,
norm_num_groups=8,
num_vq_embeddings=32,
out_channels=3,
sample_size=8,
up_block_types=["UpDecoderBlock2D"],
mid_block_add_attention=False,
lookup_from_codebook=True,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=8,
intermediate_size=8,
layer_norm_eps=1e-05,
num_attention_heads=1,
num_hidden_layers=1,
pad_token_id=1,
vocab_size=1000,
projection_dim=8,
)
text_encoder = CLIPTextModelWithProjection(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"transformer": transformer,
"scheduler": scheduler,
"vqvae": vqvae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
image = torch.full((1, 3, 4, 4), 1.0, dtype=torch.float32, device=device)
inputs = {
"prompt": "A painting of a squirrel eating a burger",
"generator": generator,
"num_inference_steps": 2,
"output_type": "np",
"image": image,
}
return inputs
def test_inference_batch_consistent(self, batch_sizes=[2]):
self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False)
@unittest.skip("aMUSEd does not support lists of generators")
def test_inference_batch_single_identical(self): ...
@slow
@require_torch_accelerator
class AmusedImg2ImgPipelineSlowTests(unittest.TestCase):
def test_amused_256(self):
pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-256")
pipe.to(torch_device)
image = (
load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
.resize((256, 256))
.convert("RGB")
)
image = pipe(
"winter mountains",
image,
generator=torch.Generator().manual_seed(0),
num_inference_steps=2,
output_type="np",
).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 256, 256, 3)
expected_slice = np.array([0.9993, 1.0, 0.9996, 1.0, 0.9995, 0.9925, 0.999, 0.9954, 1.0])
assert np.abs(image_slice - expected_slice).max() < 0.01
def test_amused_256_fp16(self):
pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-256", torch_dtype=torch.float16, variant="fp16")
pipe.to(torch_device)
image = (
load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
.resize((256, 256))
.convert("RGB")
)
image = pipe(
"winter mountains",
image,
generator=torch.Generator().manual_seed(0),
num_inference_steps=2,
output_type="np",
).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 256, 256, 3)
expected_slice = np.array([0.998, 0.998, 0.994, 0.9944, 0.996, 0.9908, 1.0, 1.0, 0.9986])
assert np.abs(image_slice - expected_slice).max() < 0.01
def test_amused_512(self):
pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-512")
pipe.to(torch_device)
image = (
load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
.resize((512, 512))
.convert("RGB")
)
image = pipe(
"winter mountains",
image,
generator=torch.Generator().manual_seed(0),
num_inference_steps=2,
output_type="np",
).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.2809, 0.1879, 0.2027, 0.2418, 0.1852, 0.2145, 0.2484, 0.2425, 0.2317])
assert np.abs(image_slice - expected_slice).max() < 0.1
def test_amused_512_fp16(self):
pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
pipe.to(torch_device)
image = (
load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
.resize((512, 512))
.convert("RGB")
)
image = pipe(
"winter mountains",
image,
generator=torch.Generator().manual_seed(0),
num_inference_steps=2,
output_type="np",
).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.2795, 0.1867, 0.2028, 0.2450, 0.1856, 0.2140, 0.2473, 0.2406, 0.2313])
assert np.abs(image_slice - expected_slice).max() < 0.1

View File

@@ -1,281 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
from diffusers import AmusedInpaintPipeline, AmusedScheduler, UVit2DModel, VQModel
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
Expectations,
enable_full_determinism,
require_torch_accelerator,
slow,
torch_device,
)
from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()
class AmusedInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = AmusedInpaintPipeline
params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"}
batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
def get_dummy_components(self):
torch.manual_seed(0)
transformer = UVit2DModel(
hidden_size=8,
use_bias=False,
hidden_dropout=0.0,
cond_embed_dim=8,
micro_cond_encode_dim=2,
micro_cond_embed_dim=10,
encoder_hidden_size=8,
vocab_size=32,
codebook_size=32,
in_channels=8,
block_out_channels=8,
num_res_blocks=1,
downsample=True,
upsample=True,
block_num_heads=1,
num_hidden_layers=1,
num_attention_heads=1,
attention_dropout=0.0,
intermediate_size=8,
layer_norm_eps=1e-06,
ln_elementwise_affine=True,
)
scheduler = AmusedScheduler(mask_token_id=31)
torch.manual_seed(0)
vqvae = VQModel(
act_fn="silu",
block_out_channels=[8],
down_block_types=["DownEncoderBlock2D"],
in_channels=3,
latent_channels=8,
layers_per_block=1,
norm_num_groups=8,
num_vq_embeddings=32,
out_channels=3,
sample_size=8,
up_block_types=["UpDecoderBlock2D"],
mid_block_add_attention=False,
lookup_from_codebook=True,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=8,
intermediate_size=8,
layer_norm_eps=1e-05,
num_attention_heads=1,
num_hidden_layers=1,
pad_token_id=1,
vocab_size=1000,
projection_dim=8,
)
text_encoder = CLIPTextModelWithProjection(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"transformer": transformer,
"scheduler": scheduler,
"vqvae": vqvae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
image = torch.full((1, 3, 4, 4), 1.0, dtype=torch.float32, device=device)
mask_image = torch.full((1, 1, 4, 4), 1.0, dtype=torch.float32, device=device)
mask_image[0, 0, 0, 0] = 0
mask_image[0, 0, 0, 1] = 0
inputs = {
"prompt": "A painting of a squirrel eating a burger",
"generator": generator,
"num_inference_steps": 2,
"output_type": "np",
"image": image,
"mask_image": mask_image,
}
return inputs
def test_inference_batch_consistent(self, batch_sizes=[2]):
self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False)
@unittest.skip("aMUSEd does not support lists of generators")
def test_inference_batch_single_identical(self): ...
@slow
@require_torch_accelerator
class AmusedInpaintPipelineSlowTests(unittest.TestCase):
def test_amused_256(self):
pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-256")
pipe.to(torch_device)
image = (
load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
.resize((256, 256))
.convert("RGB")
)
mask_image = (
load_image(
"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
)
.resize((256, 256))
.convert("L")
)
image = pipe(
"winter mountains",
image,
mask_image,
generator=torch.Generator().manual_seed(0),
num_inference_steps=2,
output_type="np",
).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 256, 256, 3)
expected_slice = np.array([0.0699, 0.0716, 0.0608, 0.0715, 0.0797, 0.0638, 0.0802, 0.0924, 0.0634])
assert np.abs(image_slice - expected_slice).max() < 0.1
def test_amused_256_fp16(self):
pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-256", variant="fp16", torch_dtype=torch.float16)
pipe.to(torch_device)
image = (
load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
.resize((256, 256))
.convert("RGB")
)
mask_image = (
load_image(
"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
)
.resize((256, 256))
.convert("L")
)
image = pipe(
"winter mountains",
image,
mask_image,
generator=torch.Generator().manual_seed(0),
num_inference_steps=2,
output_type="np",
).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 256, 256, 3)
expected_slice = np.array([0.0735, 0.0749, 0.065, 0.0739, 0.0805, 0.0667, 0.0802, 0.0923, 0.0622])
assert np.abs(image_slice - expected_slice).max() < 0.1
def test_amused_512(self):
pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-512")
pipe.to(torch_device)
image = (
load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
.resize((512, 512))
.convert("RGB")
)
mask_image = (
load_image(
"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
)
.resize((512, 512))
.convert("L")
)
image = pipe(
"winter mountains",
image,
mask_image,
generator=torch.Generator().manual_seed(0),
num_inference_steps=2,
output_type="np",
).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0005, 0.0])
assert np.abs(image_slice - expected_slice).max() < 0.05
def test_amused_512_fp16(self):
pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
pipe.to(torch_device)
image = (
load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
.resize((512, 512))
.convert("RGB")
)
mask_image = (
load_image(
"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
)
.resize((512, 512))
.convert("L")
)
image = pipe(
"winter mountains",
image,
mask_image,
generator=torch.Generator().manual_seed(0),
num_inference_steps=2,
output_type="np",
).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 512, 3)
expected_slices = Expectations(
{
("xpu", 3): np.array(
[
0.0274,
0.0211,
0.0154,
0.0257,
0.0299,
0.0170,
0.0326,
0.0420,
0.0150,
]
),
("cuda", 7): np.array(
[
0.0227,
0.0157,
0.0098,
0.0213,
0.0250,
0.0127,
0.0280,
0.0380,
0.0095,
]
),
}
)
expected_slice = expected_slices.get_expectation()
assert np.abs(image_slice - expected_slice).max() < 0.003

View File

@@ -1,461 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
import numpy as np
import torch
import torch.nn.functional as F
from transformers import (
ClapTextConfig,
ClapTextModelWithProjection,
RobertaTokenizer,
SpeechT5HifiGan,
SpeechT5HifiGanConfig,
)
from diffusers import (
AudioLDMPipeline,
AutoencoderKL,
DDIMScheduler,
LMSDiscreteScheduler,
PNDMScheduler,
UNet2DConditionModel,
)
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import backend_empty_cache, enable_full_determinism, nightly, torch_device
from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()
class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = AudioLDMPipeline
params = TEXT_TO_AUDIO_PARAMS
batch_params = TEXT_TO_AUDIO_BATCH_PARAMS
required_optional_params = frozenset(
[
"num_inference_steps",
"num_waveforms_per_prompt",
"generator",
"latents",
"output_type",
"return_dict",
"callback",
"callback_steps",
]
)
supports_dduf = False
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(8, 16),
layers_per_block=1,
norm_num_groups=8,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=(8, 16),
class_embed_type="simple_projection",
projection_class_embeddings_input_dim=8,
class_embeddings_concat=True,
)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[8, 16],
in_channels=1,
out_channels=1,
norm_num_groups=8,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
)
torch.manual_seed(0)
text_encoder_config = ClapTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=8,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=1,
num_hidden_layers=1,
pad_token_id=1,
vocab_size=1000,
projection_dim=8,
)
text_encoder = ClapTextModelWithProjection(text_encoder_config)
tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
vocoder_config = SpeechT5HifiGanConfig(
model_in_dim=8,
sampling_rate=16000,
upsample_initial_channel=16,
upsample_rates=[2, 2],
upsample_kernel_sizes=[4, 4],
resblock_kernel_sizes=[3, 7],
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]],
normalize_before=False,
)
vocoder = SpeechT5HifiGan(vocoder_config)
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"vocoder": vocoder,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "A hammer hitting a wooden surface",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
}
return inputs
def test_audioldm_ddim(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
audioldm_pipe = AudioLDMPipeline(**components)
audioldm_pipe = audioldm_pipe.to(torch_device)
audioldm_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
output = audioldm_pipe(**inputs)
audio = output.audios[0]
assert audio.ndim == 1
assert len(audio) == 256
audio_slice = audio[:10]
expected_slice = np.array(
[-0.0050, 0.0050, -0.0060, 0.0033, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0033]
)
assert np.abs(audio_slice - expected_slice).max() < 1e-2
def test_audioldm_prompt_embeds(self):
components = self.get_dummy_components()
audioldm_pipe = AudioLDMPipeline(**components)
audioldm_pipe = audioldm_pipe.to(torch_device)
audioldm_pipe = audioldm_pipe.to(torch_device)
audioldm_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(torch_device)
inputs["prompt"] = 3 * [inputs["prompt"]]
# forward
output = audioldm_pipe(**inputs)
audio_1 = output.audios[0]
inputs = self.get_dummy_inputs(torch_device)
prompt = 3 * [inputs.pop("prompt")]
text_inputs = audioldm_pipe.tokenizer(
prompt,
padding="max_length",
max_length=audioldm_pipe.tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
text_inputs = text_inputs["input_ids"].to(torch_device)
prompt_embeds = audioldm_pipe.text_encoder(
text_inputs,
)
prompt_embeds = prompt_embeds.text_embeds
# additional L_2 normalization over each hidden-state
prompt_embeds = F.normalize(prompt_embeds, dim=-1)
inputs["prompt_embeds"] = prompt_embeds
# forward
output = audioldm_pipe(**inputs)
audio_2 = output.audios[0]
assert np.abs(audio_1 - audio_2).max() < 1e-2
def test_audioldm_negative_prompt_embeds(self):
components = self.get_dummy_components()
audioldm_pipe = AudioLDMPipeline(**components)
audioldm_pipe = audioldm_pipe.to(torch_device)
audioldm_pipe = audioldm_pipe.to(torch_device)
audioldm_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
inputs["negative_prompt"] = negative_prompt
inputs["prompt"] = 3 * [inputs["prompt"]]
# forward
output = audioldm_pipe(**inputs)
audio_1 = output.audios[0]
inputs = self.get_dummy_inputs(torch_device)
prompt = 3 * [inputs.pop("prompt")]
embeds = []
for p in [prompt, negative_prompt]:
text_inputs = audioldm_pipe.tokenizer(
p,
padding="max_length",
max_length=audioldm_pipe.tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
text_inputs = text_inputs["input_ids"].to(torch_device)
text_embeds = audioldm_pipe.text_encoder(
text_inputs,
)
text_embeds = text_embeds.text_embeds
# additional L_2 normalization over each hidden-state
text_embeds = F.normalize(text_embeds, dim=-1)
embeds.append(text_embeds)
inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
# forward
output = audioldm_pipe(**inputs)
audio_2 = output.audios[0]
assert np.abs(audio_1 - audio_2).max() < 1e-2
def test_audioldm_negative_prompt(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
audioldm_pipe = AudioLDMPipeline(**components)
audioldm_pipe = audioldm_pipe.to(device)
audioldm_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
negative_prompt = "egg cracking"
output = audioldm_pipe(**inputs, negative_prompt=negative_prompt)
audio = output.audios[0]
assert audio.ndim == 1
assert len(audio) == 256
audio_slice = audio[:10]
expected_slice = np.array(
[-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032]
)
assert np.abs(audio_slice - expected_slice).max() < 1e-2
def test_audioldm_num_waveforms_per_prompt(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
audioldm_pipe = AudioLDMPipeline(**components)
audioldm_pipe = audioldm_pipe.to(device)
audioldm_pipe.set_progress_bar_config(disable=None)
prompt = "A hammer hitting a wooden surface"
# test num_waveforms_per_prompt=1 (default)
audios = audioldm_pipe(prompt, num_inference_steps=2).audios
assert audios.shape == (1, 256)
# test num_waveforms_per_prompt=1 (default) for batch of prompts
batch_size = 2
audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios
assert audios.shape == (batch_size, 256)
# test num_waveforms_per_prompt for single prompt
num_waveforms_per_prompt = 2
audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios
assert audios.shape == (num_waveforms_per_prompt, 256)
# test num_waveforms_per_prompt for batch of prompts
batch_size = 2
audios = audioldm_pipe(
[prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
).audios
assert audios.shape == (batch_size * num_waveforms_per_prompt, 256)
def test_audioldm_audio_length_in_s(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
audioldm_pipe = AudioLDMPipeline(**components)
audioldm_pipe = audioldm_pipe.to(torch_device)
audioldm_pipe.set_progress_bar_config(disable=None)
vocoder_sampling_rate = audioldm_pipe.vocoder.config.sampling_rate
inputs = self.get_dummy_inputs(device)
output = audioldm_pipe(audio_length_in_s=0.016, **inputs)
audio = output.audios[0]
assert audio.ndim == 1
assert len(audio) / vocoder_sampling_rate == 0.016
output = audioldm_pipe(audio_length_in_s=0.032, **inputs)
audio = output.audios[0]
assert audio.ndim == 1
assert len(audio) / vocoder_sampling_rate == 0.032
def test_audioldm_vocoder_model_in_dim(self):
components = self.get_dummy_components()
audioldm_pipe = AudioLDMPipeline(**components)
audioldm_pipe = audioldm_pipe.to(torch_device)
audioldm_pipe.set_progress_bar_config(disable=None)
prompt = ["hey"]
output = audioldm_pipe(prompt, num_inference_steps=1)
audio_shape = output.audios.shape
assert audio_shape == (1, 256)
config = audioldm_pipe.vocoder.config
config.model_in_dim *= 2
audioldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device)
output = audioldm_pipe(prompt, num_inference_steps=1)
audio_shape = output.audios.shape
# waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram
assert audio_shape == (1, 256)
def test_attention_slicing_forward_pass(self):
self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical()
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
)
def test_xformers_attention_forwardGenerator_pass(self):
self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
@nightly
class AudioLDMPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
inputs = {
"prompt": "A hammer hitting a wooden surface",
"latents": latents,
"generator": generator,
"num_inference_steps": 3,
"guidance_scale": 2.5,
}
return inputs
def test_audioldm(self):
audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm")
audioldm_pipe = audioldm_pipe.to(torch_device)
audioldm_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device)
inputs["num_inference_steps"] = 25
audio = audioldm_pipe(**inputs).audios[0]
assert audio.ndim == 1
assert len(audio) == 81920
audio_slice = audio[77230:77240]
expected_slice = np.array(
[-0.4884, -0.4607, 0.0023, 0.5007, 0.5896, 0.5151, 0.3813, -0.0208, -0.3687, -0.4315]
)
max_diff = np.abs(expected_slice - audio_slice).max()
assert max_diff < 1e-2
@nightly
class AudioLDMPipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
inputs = {
"prompt": "A hammer hitting a wooden surface",
"latents": latents,
"generator": generator,
"num_inference_steps": 3,
"guidance_scale": 2.5,
}
return inputs
def test_audioldm_lms(self):
audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm")
audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config)
audioldm_pipe = audioldm_pipe.to(torch_device)
audioldm_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device)
audio = audioldm_pipe(**inputs).audios[0]
assert audio.ndim == 1
assert len(audio) == 81920
audio_slice = audio[27780:27790]
expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212])
max_diff = np.abs(expected_slice - audio_slice).max()
assert max_diff < 3e-2

View File

@@ -1,204 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import torch
from PIL import Image
from transformers import CLIPTokenizer
from transformers.models.blip_2.configuration_blip_2 import Blip2Config
from transformers.models.clip.configuration_clip import CLIPTextConfig
from diffusers import AutoencoderKL, BlipDiffusionPipeline, PNDMScheduler, UNet2DConditionModel
from diffusers.utils.testing_utils import enable_full_determinism
from src.diffusers.pipelines.blip_diffusion.blip_image_processing import BlipImageProcessor
from src.diffusers.pipelines.blip_diffusion.modeling_blip2 import Blip2QFormerModel
from src.diffusers.pipelines.blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()
class BlipDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = BlipDiffusionPipeline
params = [
"prompt",
"reference_image",
"source_subject_category",
"target_subject_category",
]
batch_params = [
"prompt",
"reference_image",
"source_subject_category",
"target_subject_category",
]
required_optional_params = [
"generator",
"height",
"width",
"latents",
"guidance_scale",
"num_inference_steps",
"neg_prompt",
"guidance_scale",
"prompt_strength",
"prompt_reps",
]
supports_dduf = False
def get_dummy_components(self):
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
vocab_size=1000,
hidden_size=8,
intermediate_size=8,
projection_dim=8,
num_hidden_layers=1,
num_attention_heads=1,
max_position_embeddings=77,
)
text_encoder = ContextCLIPTextModel(text_encoder_config)
vae = AutoencoderKL(
in_channels=4,
out_channels=4,
down_block_types=("DownEncoderBlock2D",),
up_block_types=("UpDecoderBlock2D",),
block_out_channels=(8,),
norm_num_groups=8,
layers_per_block=1,
act_fn="silu",
latent_channels=4,
sample_size=8,
)
blip_vision_config = {
"hidden_size": 8,
"intermediate_size": 8,
"num_hidden_layers": 1,
"num_attention_heads": 1,
"image_size": 224,
"patch_size": 14,
"hidden_act": "quick_gelu",
}
blip_qformer_config = {
"vocab_size": 1000,
"hidden_size": 8,
"num_hidden_layers": 1,
"num_attention_heads": 1,
"intermediate_size": 8,
"max_position_embeddings": 512,
"cross_attention_frequency": 1,
"encoder_hidden_size": 8,
}
qformer_config = Blip2Config(
vision_config=blip_vision_config,
qformer_config=blip_qformer_config,
num_query_tokens=8,
tokenizer="hf-internal-testing/tiny-random-bert",
)
qformer = Blip2QFormerModel(qformer_config)
unet = UNet2DConditionModel(
block_out_channels=(8, 16),
norm_num_groups=8,
layers_per_block=1,
sample_size=16,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=8,
)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
scheduler = PNDMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
set_alpha_to_one=False,
skip_prk_steps=True,
)
vae.eval()
qformer.eval()
text_encoder.eval()
image_processor = BlipImageProcessor()
components = {
"text_encoder": text_encoder,
"vae": vae,
"qformer": qformer,
"unet": unet,
"tokenizer": tokenizer,
"scheduler": scheduler,
"image_processor": image_processor,
}
return components
def get_dummy_inputs(self, device, seed=0):
np.random.seed(seed)
reference_image = np.random.rand(32, 32, 3) * 255
reference_image = Image.fromarray(reference_image.astype("uint8")).convert("RGBA")
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "swimming underwater",
"generator": generator,
"reference_image": reference_image,
"source_subject_category": "dog",
"target_subject_category": "dog",
"height": 32,
"width": 32,
"guidance_scale": 7.5,
"num_inference_steps": 2,
"output_type": "np",
}
return inputs
def test_blipdiffusion(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
image = pipe(**self.get_dummy_inputs(device))[0]
image_slice = image[0, -3:, -3:, 0]
assert image.shape == (1, 16, 16, 4)
expected_slice = np.array(
[0.5329548, 0.8372512, 0.33269387, 0.82096875, 0.43657133, 0.3783, 0.5953028, 0.51934963, 0.42142007]
)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, (
f" expected_slice {image_slice.flatten()}, but got {image_slice.flatten()}"
)
@unittest.skip("Test not supported because of complexities in deriving query_embeds.")
def test_encode_prompt_works_in_isolation(self):
pass

View File

@@ -1,228 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import torch
from PIL import Image
from transformers import CLIPTokenizer
from transformers.models.blip_2.configuration_blip_2 import Blip2Config
from transformers.models.clip.configuration_clip import CLIPTextConfig
from diffusers import (
AutoencoderKL,
BlipDiffusionControlNetPipeline,
ControlNetModel,
PNDMScheduler,
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import enable_full_determinism, torch_device
from src.diffusers.pipelines.blip_diffusion.blip_image_processing import BlipImageProcessor
from src.diffusers.pipelines.blip_diffusion.modeling_blip2 import Blip2QFormerModel
from src.diffusers.pipelines.blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()
class BlipDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = BlipDiffusionControlNetPipeline
params = [
"prompt",
"reference_image",
"source_subject_category",
"target_subject_category",
"condtioning_image",
]
batch_params = [
"prompt",
"reference_image",
"source_subject_category",
"target_subject_category",
"condtioning_image",
]
required_optional_params = [
"generator",
"height",
"width",
"latents",
"guidance_scale",
"num_inference_steps",
"neg_prompt",
"guidance_scale",
"prompt_strength",
"prompt_reps",
]
supports_dduf = False
def get_dummy_components(self):
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
vocab_size=1000,
hidden_size=16,
intermediate_size=16,
projection_dim=16,
num_hidden_layers=1,
num_attention_heads=1,
max_position_embeddings=77,
)
text_encoder = ContextCLIPTextModel(text_encoder_config)
vae = AutoencoderKL(
in_channels=4,
out_channels=4,
down_block_types=("DownEncoderBlock2D",),
up_block_types=("UpDecoderBlock2D",),
block_out_channels=(32,),
layers_per_block=1,
act_fn="silu",
latent_channels=4,
norm_num_groups=16,
sample_size=16,
)
blip_vision_config = {
"hidden_size": 16,
"intermediate_size": 16,
"num_hidden_layers": 1,
"num_attention_heads": 1,
"image_size": 224,
"patch_size": 14,
"hidden_act": "quick_gelu",
}
blip_qformer_config = {
"vocab_size": 1000,
"hidden_size": 16,
"num_hidden_layers": 1,
"num_attention_heads": 1,
"intermediate_size": 16,
"max_position_embeddings": 512,
"cross_attention_frequency": 1,
"encoder_hidden_size": 16,
}
qformer_config = Blip2Config(
vision_config=blip_vision_config,
qformer_config=blip_qformer_config,
num_query_tokens=16,
tokenizer="hf-internal-testing/tiny-random-bert",
)
qformer = Blip2QFormerModel(qformer_config)
unet = UNet2DConditionModel(
block_out_channels=(4, 16),
layers_per_block=1,
norm_num_groups=4,
sample_size=16,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=16,
)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
scheduler = PNDMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
set_alpha_to_one=False,
skip_prk_steps=True,
)
controlnet = ControlNetModel(
block_out_channels=(4, 16),
layers_per_block=1,
in_channels=4,
norm_num_groups=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
cross_attention_dim=16,
conditioning_embedding_out_channels=(8, 16),
)
vae.eval()
qformer.eval()
text_encoder.eval()
image_processor = BlipImageProcessor()
components = {
"text_encoder": text_encoder,
"vae": vae,
"qformer": qformer,
"unet": unet,
"tokenizer": tokenizer,
"scheduler": scheduler,
"controlnet": controlnet,
"image_processor": image_processor,
}
return components
def get_dummy_inputs(self, device, seed=0):
np.random.seed(seed)
reference_image = np.random.rand(32, 32, 3) * 255
reference_image = Image.fromarray(reference_image.astype("uint8")).convert("RGBA")
cond_image = np.random.rand(32, 32, 3) * 255
cond_image = Image.fromarray(cond_image.astype("uint8")).convert("RGBA")
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "swimming underwater",
"generator": generator,
"reference_image": reference_image,
"condtioning_image": cond_image,
"source_subject_category": "dog",
"target_subject_category": "dog",
"height": 32,
"width": 32,
"guidance_scale": 7.5,
"num_inference_steps": 2,
"output_type": "np",
}
return inputs
def test_dict_tuple_outputs_equivalent(self):
expected_slice = None
if torch_device == "cpu":
expected_slice = np.array([0.4803, 0.3865, 0.1422, 0.6119, 0.2283, 0.6365, 0.5453, 0.5205, 0.3581])
super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
def test_blipdiffusion_controlnet(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
image = pipe(**self.get_dummy_inputs(device))[0]
image_slice = image[0, -3:, -3:, 0]
assert image.shape == (1, 16, 16, 4)
expected_slice = np.array([0.7953, 0.7136, 0.6597, 0.4779, 0.7389, 0.4111, 0.5826, 0.4150, 0.8422])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, (
f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
)
@unittest.skip("Test not supported because of complexities in deriving query_embeds.")
def test_encode_prompt_works_in_isolation(self):
pass

View File

@@ -1,352 +0,0 @@
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
AsymmetricAutoencoderKL,
AutoencoderKL,
AutoencoderTiny,
ConsistencyDecoderVAE,
ControlNetXSAdapter,
DDIMScheduler,
LCMScheduler,
StableDiffusionControlNetXSPipeline,
UNet2DConditionModel,
)
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
load_image,
require_accelerator,
require_torch_accelerator,
slow,
torch_device,
)
from diffusers.utils.torch_utils import randn_tensor
from ...models.autoencoders.vae import (
get_asym_autoencoder_kl_config,
get_autoencoder_kl_config,
get_autoencoder_tiny_config,
get_consistency_vae_config,
)
from ..pipeline_params import (
IMAGE_TO_IMAGE_IMAGE_PARAMS,
TEXT_TO_IMAGE_BATCH_PARAMS,
TEXT_TO_IMAGE_IMAGE_PARAMS,
TEXT_TO_IMAGE_PARAMS,
)
from ..test_pipelines_common import (
PipelineKarrasSchedulerTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
SDFunctionTesterMixin,
)
enable_full_determinism()
def to_np(tensor):
if isinstance(tensor, torch.Tensor):
tensor = tensor.detach().cpu().numpy()
return tensor
class ControlNetXSPipelineFastTests(
PipelineLatentTesterMixin,
PipelineKarrasSchedulerTesterMixin,
PipelineTesterMixin,
SDFunctionTesterMixin,
unittest.TestCase,
):
pipeline_class = StableDiffusionControlNetXSPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
test_attention_slicing = False
test_layerwise_casting = True
test_group_offloading = True
def get_dummy_components(self, time_cond_proj_dim=None):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(4, 8),
layers_per_block=2,
sample_size=16,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=8,
norm_num_groups=4,
time_cond_proj_dim=time_cond_proj_dim,
use_linear_projection=True,
)
torch.manual_seed(0)
controlnet = ControlNetXSAdapter.from_unet(
unet=unet,
size_ratio=1,
learn_time_embedding=True,
conditioning_embedding_out_channels=(2, 2),
)
torch.manual_seed(0)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[4, 8],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
norm_num_groups=2,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=8,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"controlnet": controlnet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
controlnet_embedder_scale_factor = 2
image = randn_tensor(
(1, 3, 8 * controlnet_embedder_scale_factor, 8 * controlnet_embedder_scale_factor),
generator=generator,
device=torch.device(device),
)
inputs = {
"prompt": "A painting of a squirrel eating a burger",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
"output_type": "numpy",
"image": image,
}
return inputs
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
)
def test_xformers_attention_forwardGenerator_pass(self):
self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(expected_max_diff=2e-3)
def test_controlnet_lcm(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components(time_cond_proj_dim=8)
sd_pipe = StableDiffusionControlNetXSPipeline(**components)
sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
output = sd_pipe(**inputs)
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 16, 16, 3)
expected_slice = np.array([0.745, 0.753, 0.767, 0.543, 0.523, 0.502, 0.314, 0.521, 0.478])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_to_dtype(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
# pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the dtype from pipe.components
model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
pipe.to(dtype=torch.float16)
model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
def test_multi_vae(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
block_out_channels = pipe.vae.config.block_out_channels
norm_num_groups = pipe.vae.config.norm_num_groups
vae_classes = [AutoencoderKL, AsymmetricAutoencoderKL, ConsistencyDecoderVAE, AutoencoderTiny]
configs = [
get_autoencoder_kl_config(block_out_channels, norm_num_groups),
get_asym_autoencoder_kl_config(block_out_channels, norm_num_groups),
get_consistency_vae_config(block_out_channels, norm_num_groups),
get_autoencoder_tiny_config(block_out_channels),
]
out_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
for vae_cls, config in zip(vae_classes, configs):
vae = vae_cls(**config)
vae = vae.to(torch_device)
components["vae"] = vae
vae_pipe = self.pipeline_class(**components)
# pipeline creates a new UNetControlNetXSModel under the hood, which aren't on device.
# So we need to move the new pipe to device.
vae_pipe.to(torch_device)
vae_pipe.set_progress_bar_config(disable=None)
out_vae_np = vae_pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
assert out_vae_np.shape == out_np.shape
@require_accelerator
def test_to_device(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
pipe.to("cpu")
# pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the device from pipe.components
model_devices = [
component.device.type for component in pipe.components.values() if hasattr(component, "device")
]
self.assertTrue(all(device == "cpu" for device in model_devices))
output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
self.assertTrue(np.isnan(output_cpu).sum() == 0)
pipe.to(torch_device)
model_devices = [
component.device.type for component in pipe.components.values() if hasattr(component, "device")
]
self.assertTrue(all(device == torch_device for device in model_devices))
output_device = pipe(**self.get_dummy_inputs(torch_device))[0]
self.assertTrue(np.isnan(to_np(output_device)).sum() == 0)
def test_encode_prompt_works_in_isolation(self):
extra_required_param_value_dict = {
"device": torch.device(torch_device).type,
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
}
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
@slow
@require_torch_accelerator
class ControlNetXSPipelineSlowTests(unittest.TestCase):
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_canny(self):
controlnet = ControlNetXSAdapter.from_pretrained(
"UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
prompt = "bird"
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
)
output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
image = output.images[0]
assert image.shape == (768, 512, 3)
original_image = image[-3:, -3:, -1].flatten()
expected_image = np.array([0.1963, 0.229, 0.2659, 0.2109, 0.2332, 0.2827, 0.2534, 0.2422, 0.2808])
assert np.allclose(original_image, expected_image, atol=1e-04)
def test_depth(self):
controlnet = ControlNetXSAdapter.from_pretrained(
"UmerHA/Testing-ConrolNetXS-SD2.1-depth", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
prompt = "Stormtrooper's lecture"
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
)
output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)
image = output.images[0]
assert image.shape == (512, 512, 3)
original_image = image[-3:, -3:, -1].flatten()
expected_image = np.array([0.4844, 0.4937, 0.4956, 0.4663, 0.5039, 0.5044, 0.4565, 0.4883, 0.4941])
assert np.allclose(original_image, expected_image, atol=1e-04)

View File

@@ -1,393 +0,0 @@
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
from diffusers import (
AsymmetricAutoencoderKL,
AutoencoderKL,
AutoencoderTiny,
ConsistencyDecoderVAE,
ControlNetXSAdapter,
EulerDiscreteScheduler,
StableDiffusionXLControlNetXSPipeline,
UNet2DConditionModel,
)
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
load_image,
require_torch_accelerator,
slow,
torch_device,
)
from diffusers.utils.torch_utils import randn_tensor
from ...models.autoencoders.vae import (
get_asym_autoencoder_kl_config,
get_autoencoder_kl_config,
get_autoencoder_tiny_config,
get_consistency_vae_config,
)
from ..pipeline_params import (
IMAGE_TO_IMAGE_IMAGE_PARAMS,
TEXT_TO_IMAGE_BATCH_PARAMS,
TEXT_TO_IMAGE_IMAGE_PARAMS,
TEXT_TO_IMAGE_PARAMS,
)
from ..test_pipelines_common import (
PipelineKarrasSchedulerTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
)
enable_full_determinism()
class StableDiffusionXLControlNetXSPipelineFastTests(
PipelineLatentTesterMixin,
PipelineKarrasSchedulerTesterMixin,
PipelineTesterMixin,
unittest.TestCase,
):
pipeline_class = StableDiffusionXLControlNetXSPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
test_attention_slicing = False
test_layerwise_casting = True
test_group_offloading = True
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(4, 8),
layers_per_block=2,
sample_size=16,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
use_linear_projection=True,
norm_num_groups=4,
# SD2-specific config below
attention_head_dim=(2, 4),
addition_embed_type="text_time",
addition_time_embed_dim=8,
transformer_layers_per_block=(1, 2),
projection_class_embeddings_input_dim=56, # 6 * 8 (addition_time_embed_dim) + 8 (cross_attention_dim)
cross_attention_dim=8,
)
torch.manual_seed(0)
controlnet = ControlNetXSAdapter.from_unet(
unet=unet,
size_ratio=0.5,
learn_time_embedding=True,
conditioning_embedding_out_channels=(2, 2),
)
torch.manual_seed(0)
scheduler = EulerDiscreteScheduler(
beta_start=0.00085,
beta_end=0.012,
steps_offset=1,
beta_schedule="scaled_linear",
timestep_spacing="leading",
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[4, 8],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
norm_num_groups=2,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=4,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
# SD2-specific config below
hidden_act="gelu",
projection_dim=8,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"controlnet": controlnet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"text_encoder_2": text_encoder_2,
"tokenizer_2": tokenizer_2,
"feature_extractor": None,
}
return components
# Copied from test_controlnet_sdxl.py
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
controlnet_embedder_scale_factor = 2
image = randn_tensor(
(1, 3, 8 * controlnet_embedder_scale_factor, 8 * controlnet_embedder_scale_factor),
generator=generator,
device=torch.device(device),
)
inputs = {
"prompt": "A painting of a squirrel eating a burger",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
"output_type": "np",
"image": image,
}
return inputs
# Copied from test_controlnet_sdxl.py
def test_attention_slicing_forward_pass(self):
return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
)
# Copied from test_controlnet_sdxl.py
def test_xformers_attention_forwardGenerator_pass(self):
self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)
# Copied from test_controlnet_sdxl.py
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(expected_max_diff=2e-3)
@unittest.skip("We test this functionality elsewhere already.")
def test_save_load_optional_components(self):
pass
@require_torch_accelerator
# Copied from test_controlnet_sdxl.py
def test_stable_diffusion_xl_offloads(self):
pipes = []
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components).to(torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
for pipe in pipes:
pipe.unet.set_default_attn_processor()
inputs = self.get_dummy_inputs(torch_device)
image = pipe(**inputs).images
image_slices.append(image[0, -3:, -3:, -1].flatten())
assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
# Copied from test_controlnet_sdxl.py
def test_stable_diffusion_xl_multi_prompts(self):
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components).to(torch_device)
# forward with single prompt
inputs = self.get_dummy_inputs(torch_device)
output = sd_pipe(**inputs)
image_slice_1 = output.images[0, -3:, -3:, -1]
# forward with same prompt duplicated
inputs = self.get_dummy_inputs(torch_device)
inputs["prompt_2"] = inputs["prompt"]
output = sd_pipe(**inputs)
image_slice_2 = output.images[0, -3:, -3:, -1]
# ensure the results are equal
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
# forward with different prompt
inputs = self.get_dummy_inputs(torch_device)
inputs["prompt_2"] = "different prompt"
output = sd_pipe(**inputs)
image_slice_3 = output.images[0, -3:, -3:, -1]
# ensure the results are not equal
assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
# manually set a negative_prompt
inputs = self.get_dummy_inputs(torch_device)
inputs["negative_prompt"] = "negative prompt"
output = sd_pipe(**inputs)
image_slice_1 = output.images[0, -3:, -3:, -1]
# forward with same negative_prompt duplicated
inputs = self.get_dummy_inputs(torch_device)
inputs["negative_prompt"] = "negative prompt"
inputs["negative_prompt_2"] = inputs["negative_prompt"]
output = sd_pipe(**inputs)
image_slice_2 = output.images[0, -3:, -3:, -1]
# ensure the results are equal
assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
# forward with different negative_prompt
inputs = self.get_dummy_inputs(torch_device)
inputs["negative_prompt"] = "negative prompt"
inputs["negative_prompt_2"] = "different negative prompt"
output = sd_pipe(**inputs)
image_slice_3 = output.images[0, -3:, -3:, -1]
# ensure the results are not equal
assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4
# Copied from test_controlnetxs.py
def test_to_dtype(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
# pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the dtype from pipe.components
model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
pipe.to(dtype=torch.float16)
model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
def test_multi_vae(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
block_out_channels = pipe.vae.config.block_out_channels
norm_num_groups = pipe.vae.config.norm_num_groups
vae_classes = [AutoencoderKL, AsymmetricAutoencoderKL, ConsistencyDecoderVAE, AutoencoderTiny]
configs = [
get_autoencoder_kl_config(block_out_channels, norm_num_groups),
get_asym_autoencoder_kl_config(block_out_channels, norm_num_groups),
get_consistency_vae_config(block_out_channels, norm_num_groups),
get_autoencoder_tiny_config(block_out_channels),
]
out_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
for vae_cls, config in zip(vae_classes, configs):
vae = vae_cls(**config)
vae = vae.to(torch_device)
components["vae"] = vae
vae_pipe = self.pipeline_class(**components)
# pipeline creates a new UNetControlNetXSModel under the hood, which aren't on device.
# So we need to move the new pipe to device.
vae_pipe.to(torch_device)
vae_pipe.set_progress_bar_config(disable=None)
out_vae_np = vae_pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]
assert out_vae_np.shape == out_np.shape
@slow
@require_torch_accelerator
class StableDiffusionXLControlNetXSPipelineSlowTests(unittest.TestCase):
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_canny(self):
controlnet = ControlNetXSAdapter.from_pretrained(
"UmerHA/Testing-ConrolNetXS-SDXL-canny", torch_dtype=torch.float16
)
pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_sequential_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
prompt = "bird"
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
)
images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
assert images[0].shape == (768, 512, 3)
original_image = images[0, -3:, -3:, -1].flatten()
expected_image = np.array([0.3202, 0.3151, 0.3328, 0.3172, 0.337, 0.3381, 0.3378, 0.3389, 0.3224])
assert np.allclose(original_image, expected_image, atol=1e-04)
def test_depth(self):
controlnet = ControlNetXSAdapter.from_pretrained(
"UmerHA/Testing-ConrolNetXS-SDXL-depth", torch_dtype=torch.float16
)
pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.enable_sequential_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
prompt = "Stormtrooper's lecture"
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
)
images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images
assert images[0].shape == (512, 512, 3)
original_image = images[0, -3:, -3:, -1].flatten()
expected_image = np.array([0.5448, 0.5437, 0.5426, 0.5543, 0.553, 0.5475, 0.5595, 0.5602, 0.5529])
assert np.allclose(original_image, expected_image, atol=1e-04)

View File

@@ -1,174 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
import numpy as np
import torch
from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
nightly,
require_torch_accelerator,
skip_mps,
torch_device,
)
from ..pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()
class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = DanceDiffusionPipeline
params = UNCONDITIONAL_AUDIO_GENERATION_PARAMS
required_optional_params = PipelineTesterMixin.required_optional_params - {
"callback",
"latents",
"callback_steps",
"output_type",
"num_images_per_prompt",
}
batch_params = UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS
test_attention_slicing = False
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet1DModel(
block_out_channels=(32, 32, 64),
extra_in_channels=16,
sample_size=512,
sample_rate=16_000,
in_channels=2,
out_channels=2,
flip_sin_to_cos=True,
use_timestep_embedding=False,
time_embedding_type="fourier",
mid_block_type="UNetMidBlock1D",
down_block_types=("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
)
scheduler = IPNDMScheduler()
components = {
"unet": unet,
"scheduler": scheduler,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"batch_size": 1,
"generator": generator,
"num_inference_steps": 4,
}
return inputs
def test_dance_diffusion(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
pipe = DanceDiffusionPipeline(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
output = pipe(**inputs)
audio = output.audios
audio_slice = audio[0, -3:, -3:]
assert audio.shape == (1, 2, components["unet"].sample_size)
expected_slice = np.array([-0.7265, 1.0000, -0.8388, 0.1175, 0.9498, -1.0000])
assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
@skip_mps
def test_save_load_local(self):
return super().test_save_load_local()
@skip_mps
def test_dict_tuple_outputs_equivalent(self):
return super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3)
@skip_mps
def test_save_load_optional_components(self):
return super().test_save_load_optional_components()
@skip_mps
def test_attention_slicing_forward_pass(self):
return super().test_attention_slicing_forward_pass()
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=3e-3)
@nightly
@require_torch_accelerator
class PipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_dance_diffusion(self):
device = torch_device
pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k")
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
generator = torch.manual_seed(0)
output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
audio = output.audios
audio_slice = audio[0, -3:, -3:]
assert audio.shape == (1, 2, pipe.unet.config.sample_size)
expected_slice = np.array([-0.0192, -0.0231, -0.0318, -0.0059, 0.0002, -0.0020])
assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
def test_dance_diffusion_fp16(self):
device = torch_device
pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", torch_dtype=torch.float16)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
generator = torch.manual_seed(0)
output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
audio = output.audios
audio_slice = audio[0, -3:, -3:]
assert audio.shape == (1, 2, pipe.unet.config.sample_size)
expected_slice = np.array([-0.0367, -0.0488, -0.0771, -0.0525, -0.0444, -0.0341])
assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2

View File

@@ -1,283 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import random
import unittest
import numpy as np
import pytest
import torch
from transformers import (
CLIPImageProcessor,
CLIPTextConfig,
CLIPTextModel,
CLIPTokenizer,
CLIPVisionConfig,
CLIPVisionModelWithProjection,
)
from diffusers import (
AutoencoderKL,
DDIMScheduler,
I2VGenXLPipeline,
)
from diffusers.models.unets import I2VGenXLUNet
from diffusers.utils import is_xformers_available, load_image
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
is_torch_version,
numpy_cosine_similarity_distance,
require_torch_accelerator,
skip_mps,
slow,
torch_device,
)
from ..test_pipelines_common import PipelineTesterMixin, SDFunctionTesterMixin
enable_full_determinism()
@skip_mps
class I2VGenXLPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = I2VGenXLPipeline
params = frozenset(["prompt", "negative_prompt", "image"])
batch_params = frozenset(["prompt", "negative_prompt", "image", "generator"])
# No `output_type`.
required_optional_params = frozenset(["num_inference_steps", "generator", "latents", "return_dict"])
supports_dduf = False
test_layerwise_casting = True
def get_dummy_components(self):
torch.manual_seed(0)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
torch.manual_seed(0)
unet = I2VGenXLUNet(
block_out_channels=(4, 8),
layers_per_block=1,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
cross_attention_dim=4,
attention_head_dim=4,
num_attention_heads=None,
norm_num_groups=2,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=(8,),
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D"],
latent_channels=4,
sample_size=32,
norm_num_groups=2,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=4,
intermediate_size=16,
layer_norm_eps=1e-05,
num_attention_heads=2,
num_hidden_layers=2,
pad_token_id=1,
vocab_size=1000,
hidden_act="gelu",
projection_dim=32,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
torch.manual_seed(0)
vision_encoder_config = CLIPVisionConfig(
hidden_size=4,
projection_dim=4,
num_hidden_layers=2,
num_attention_heads=2,
image_size=32,
intermediate_size=16,
patch_size=1,
)
image_encoder = CLIPVisionModelWithProjection(vision_encoder_config)
torch.manual_seed(0)
feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"image_encoder": image_encoder,
"tokenizer": tokenizer,
"feature_extractor": feature_extractor,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
inputs = {
"prompt": "A painting of a squirrel eating a burger",
"image": input_image,
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
"output_type": "pt",
"num_frames": 4,
"width": 32,
"height": 32,
}
return inputs
def test_text_to_video_default_case(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
inputs["output_type"] = "np"
frames = pipe(**inputs).frames
image_slice = frames[0][0][-3:, -3:, -1]
assert frames[0][0].shape == (32, 32, 3)
expected_slice = np.array([0.5146, 0.6525, 0.6032, 0.5204, 0.5675, 0.4125, 0.3016, 0.5172, 0.4095])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@pytest.mark.xfail(
condition=is_torch_version(">=", "2.7"),
reason="Test currently fails on PyTorch 2.7.",
strict=False,
)
def test_save_load_local(self):
super().test_save_load_local(expected_max_difference=0.006)
def test_sequential_cpu_offload_forward_pass(self):
super().test_sequential_cpu_offload_forward_pass(expected_max_diff=0.008)
def test_dict_tuple_outputs_equivalent(self):
super().test_dict_tuple_outputs_equivalent(expected_max_difference=0.009)
def test_save_load_optional_components(self):
super().test_save_load_optional_components(expected_max_difference=0.008)
@unittest.skip("Deprecated functionality")
def test_attention_slicing_forward_pass(self):
pass
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
)
def test_xformers_attention_forwardGenerator_pass(self):
self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=1e-2)
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=0.008)
def test_model_cpu_offload_forward_pass(self):
super().test_model_cpu_offload_forward_pass(expected_max_diff=0.008)
def test_num_videos_per_prompt(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
inputs["output_type"] = "np"
frames = pipe(**inputs, num_videos_per_prompt=2).frames
assert frames.shape == (2, 4, 32, 32, 3)
assert frames[0][0].shape == (32, 32, 3)
image_slice = frames[0][0][-3:, -3:, -1]
expected_slice = np.array([0.5146, 0.6525, 0.6032, 0.5204, 0.5675, 0.4125, 0.3016, 0.5172, 0.4095])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@unittest.skip("Test not supported for now.")
def test_encode_prompt_works_in_isolation(self):
pass
@slow
@require_torch_accelerator
class I2VGenXLPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_i2vgen_xl(self):
pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
)
generator = torch.Generator("cpu").manual_seed(0)
num_frames = 3
output = pipe(
image=image,
prompt="my cat",
num_frames=num_frames,
generator=generator,
num_inference_steps=3,
output_type="np",
)
image = output.frames[0]
assert image.shape == (num_frames, 704, 1280, 3)
image_slice = image[0, -3:, -3:, -1]
expected_slice = np.array([0.5482, 0.6244, 0.6274, 0.4584, 0.5935, 0.5937, 0.4579, 0.5767, 0.5892])
assert numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice.flatten()) < 1e-3

View File

@@ -1,478 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
import numpy as np
import torch
from transformers import (
ClapAudioConfig,
ClapConfig,
ClapFeatureExtractor,
ClapModel,
ClapTextConfig,
RobertaTokenizer,
SpeechT5HifiGan,
SpeechT5HifiGanConfig,
)
from diffusers import (
AutoencoderKL,
DDIMScheduler,
LMSDiscreteScheduler,
MusicLDMPipeline,
PNDMScheduler,
UNet2DConditionModel,
)
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
nightly,
require_torch_accelerator,
torch_device,
)
from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()
class MusicLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = MusicLDMPipeline
params = TEXT_TO_AUDIO_PARAMS
batch_params = TEXT_TO_AUDIO_BATCH_PARAMS
required_optional_params = frozenset(
[
"num_inference_steps",
"num_waveforms_per_prompt",
"generator",
"latents",
"output_type",
"return_dict",
"callback",
"callback_steps",
]
)
supports_dduf = False
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(32, 64),
layers_per_block=2,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=(32, 64),
class_embed_type="simple_projection",
projection_class_embeddings_input_dim=32,
class_embeddings_concat=True,
)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
in_channels=1,
out_channels=1,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
)
torch.manual_seed(0)
text_branch_config = ClapTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=16,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=2,
num_hidden_layers=2,
pad_token_id=1,
vocab_size=1000,
)
audio_branch_config = ClapAudioConfig(
spec_size=64,
window_size=4,
num_mel_bins=64,
intermediate_size=37,
layer_norm_eps=1e-05,
depths=[2, 2],
num_attention_heads=[2, 2],
num_hidden_layers=2,
hidden_size=192,
patch_size=2,
patch_stride=2,
patch_embed_input_channels=4,
)
text_encoder_config = ClapConfig.from_text_audio_configs(
text_config=text_branch_config, audio_config=audio_branch_config, projection_dim=32
)
text_encoder = ClapModel(text_encoder_config)
tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
feature_extractor = ClapFeatureExtractor.from_pretrained(
"hf-internal-testing/tiny-random-ClapModel", hop_length=7900
)
torch.manual_seed(0)
vocoder_config = SpeechT5HifiGanConfig(
model_in_dim=8,
sampling_rate=16000,
upsample_initial_channel=16,
upsample_rates=[2, 2],
upsample_kernel_sizes=[4, 4],
resblock_kernel_sizes=[3, 7],
resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]],
normalize_before=False,
)
vocoder = SpeechT5HifiGan(vocoder_config)
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"feature_extractor": feature_extractor,
"vocoder": vocoder,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "A hammer hitting a wooden surface",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
}
return inputs
def test_musicldm_ddim(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
musicldm_pipe = MusicLDMPipeline(**components)
musicldm_pipe = musicldm_pipe.to(torch_device)
musicldm_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
output = musicldm_pipe(**inputs)
audio = output.audios[0]
assert audio.ndim == 1
assert len(audio) == 256
audio_slice = audio[:10]
expected_slice = np.array(
[-0.0027, -0.0036, -0.0037, -0.0020, -0.0035, -0.0019, -0.0037, -0.0020, -0.0038, -0.0019]
)
assert np.abs(audio_slice - expected_slice).max() < 1e-4
def test_musicldm_prompt_embeds(self):
components = self.get_dummy_components()
musicldm_pipe = MusicLDMPipeline(**components)
musicldm_pipe = musicldm_pipe.to(torch_device)
musicldm_pipe = musicldm_pipe.to(torch_device)
musicldm_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(torch_device)
inputs["prompt"] = 3 * [inputs["prompt"]]
# forward
output = musicldm_pipe(**inputs)
audio_1 = output.audios[0]
inputs = self.get_dummy_inputs(torch_device)
prompt = 3 * [inputs.pop("prompt")]
text_inputs = musicldm_pipe.tokenizer(
prompt,
padding="max_length",
max_length=musicldm_pipe.tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
text_inputs = text_inputs["input_ids"].to(torch_device)
prompt_embeds = musicldm_pipe.text_encoder.get_text_features(text_inputs)
inputs["prompt_embeds"] = prompt_embeds
# forward
output = musicldm_pipe(**inputs)
audio_2 = output.audios[0]
assert np.abs(audio_1 - audio_2).max() < 1e-2
def test_musicldm_negative_prompt_embeds(self):
components = self.get_dummy_components()
musicldm_pipe = MusicLDMPipeline(**components)
musicldm_pipe = musicldm_pipe.to(torch_device)
musicldm_pipe = musicldm_pipe.to(torch_device)
musicldm_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(torch_device)
negative_prompt = 3 * ["this is a negative prompt"]
inputs["negative_prompt"] = negative_prompt
inputs["prompt"] = 3 * [inputs["prompt"]]
# forward
output = musicldm_pipe(**inputs)
audio_1 = output.audios[0]
inputs = self.get_dummy_inputs(torch_device)
prompt = 3 * [inputs.pop("prompt")]
embeds = []
for p in [prompt, negative_prompt]:
text_inputs = musicldm_pipe.tokenizer(
p,
padding="max_length",
max_length=musicldm_pipe.tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
text_inputs = text_inputs["input_ids"].to(torch_device)
text_embeds = musicldm_pipe.text_encoder.get_text_features(
text_inputs,
)
embeds.append(text_embeds)
inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds
# forward
output = musicldm_pipe(**inputs)
audio_2 = output.audios[0]
assert np.abs(audio_1 - audio_2).max() < 1e-2
def test_musicldm_negative_prompt(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
musicldm_pipe = MusicLDMPipeline(**components)
musicldm_pipe = musicldm_pipe.to(device)
musicldm_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
negative_prompt = "egg cracking"
output = musicldm_pipe(**inputs, negative_prompt=negative_prompt)
audio = output.audios[0]
assert audio.ndim == 1
assert len(audio) == 256
audio_slice = audio[:10]
expected_slice = np.array(
[-0.0027, -0.0036, -0.0037, -0.0019, -0.0035, -0.0018, -0.0037, -0.0021, -0.0038, -0.0018]
)
assert np.abs(audio_slice - expected_slice).max() < 1e-4
def test_musicldm_num_waveforms_per_prompt(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
musicldm_pipe = MusicLDMPipeline(**components)
musicldm_pipe = musicldm_pipe.to(device)
musicldm_pipe.set_progress_bar_config(disable=None)
prompt = "A hammer hitting a wooden surface"
# test num_waveforms_per_prompt=1 (default)
audios = musicldm_pipe(prompt, num_inference_steps=2).audios
assert audios.shape == (1, 256)
# test num_waveforms_per_prompt=1 (default) for batch of prompts
batch_size = 2
audios = musicldm_pipe([prompt] * batch_size, num_inference_steps=2).audios
assert audios.shape == (batch_size, 256)
# test num_waveforms_per_prompt for single prompt
num_waveforms_per_prompt = 2
audios = musicldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios
assert audios.shape == (num_waveforms_per_prompt, 256)
# test num_waveforms_per_prompt for batch of prompts
batch_size = 2
audios = musicldm_pipe(
[prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
).audios
assert audios.shape == (batch_size * num_waveforms_per_prompt, 256)
def test_musicldm_audio_length_in_s(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
musicldm_pipe = MusicLDMPipeline(**components)
musicldm_pipe = musicldm_pipe.to(torch_device)
musicldm_pipe.set_progress_bar_config(disable=None)
vocoder_sampling_rate = musicldm_pipe.vocoder.config.sampling_rate
inputs = self.get_dummy_inputs(device)
output = musicldm_pipe(audio_length_in_s=0.016, **inputs)
audio = output.audios[0]
assert audio.ndim == 1
assert len(audio) / vocoder_sampling_rate == 0.016
output = musicldm_pipe(audio_length_in_s=0.032, **inputs)
audio = output.audios[0]
assert audio.ndim == 1
assert len(audio) / vocoder_sampling_rate == 0.032
def test_musicldm_vocoder_model_in_dim(self):
components = self.get_dummy_components()
musicldm_pipe = MusicLDMPipeline(**components)
musicldm_pipe = musicldm_pipe.to(torch_device)
musicldm_pipe.set_progress_bar_config(disable=None)
prompt = ["hey"]
output = musicldm_pipe(prompt, num_inference_steps=1)
audio_shape = output.audios.shape
assert audio_shape == (1, 256)
config = musicldm_pipe.vocoder.config
config.model_in_dim *= 2
musicldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device)
output = musicldm_pipe(prompt, num_inference_steps=1)
audio_shape = output.audios.shape
# waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram
assert audio_shape == (1, 256)
def test_attention_slicing_forward_pass(self):
self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical()
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
)
def test_xformers_attention_forwardGenerator_pass(self):
self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
def test_to_dtype(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
# The method component.dtype returns the dtype of the first parameter registered in the model, not the
# dtype of the entire model. In the case of CLAP, the first parameter is a float64 constant (logit scale)
model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
# Without the logit scale parameters, everything is float32
model_dtypes.pop("text_encoder")
self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
# the CLAP sub-models are float32
model_dtypes["clap_text_branch"] = components["text_encoder"].text_model.dtype
self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))
# Once we send to fp16, all params are in half-precision, including the logit scale
pipe.to(dtype=torch.float16)
model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))
@nightly
@require_torch_accelerator
class MusicLDMPipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
inputs = {
"prompt": "A hammer hitting a wooden surface",
"latents": latents,
"generator": generator,
"num_inference_steps": 3,
"guidance_scale": 2.5,
}
return inputs
def test_musicldm(self):
musicldm_pipe = MusicLDMPipeline.from_pretrained("cvssp/musicldm")
musicldm_pipe = musicldm_pipe.to(torch_device)
musicldm_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device)
inputs["num_inference_steps"] = 25
audio = musicldm_pipe(**inputs).audios[0]
assert audio.ndim == 1
assert len(audio) == 81952
# check the portion of the generated audio with the largest dynamic range (reduces flakiness)
audio_slice = audio[8680:8690]
expected_slice = np.array(
[-0.1042, -0.1068, -0.1235, -0.1387, -0.1428, -0.136, -0.1213, -0.1097, -0.0967, -0.0945]
)
max_diff = np.abs(expected_slice - audio_slice).max()
assert max_diff < 1e-3
def test_musicldm_lms(self):
musicldm_pipe = MusicLDMPipeline.from_pretrained("cvssp/musicldm")
musicldm_pipe.scheduler = LMSDiscreteScheduler.from_config(musicldm_pipe.scheduler.config)
musicldm_pipe = musicldm_pipe.to(torch_device)
musicldm_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device)
audio = musicldm_pipe(**inputs).audios[0]
assert audio.ndim == 1
assert len(audio) == 81952
# check the portion of the generated audio with the largest dynamic range (reduces flakiness)
audio_slice = audio[58020:58030]
expected_slice = np.array([0.3592, 0.3477, 0.4084, 0.4665, 0.5048, 0.5891, 0.6461, 0.5579, 0.4595, 0.4403])
max_diff = np.abs(expected_slice - audio_slice).max()
assert max_diff < 1e-3

View File

@@ -1,229 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import random
import unittest
import numpy as np
import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionConfig
from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel
from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
nightly,
require_torch_accelerator,
torch_device,
)
from ..pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()
class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = PaintByExamplePipeline
params = IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
batch_params = IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
image_params = frozenset([]) # TO_DO: update the image_prams once refactored VaeImageProcessor.preprocess
supports_dduf = False
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(32, 64),
layers_per_block=2,
sample_size=32,
in_channels=9,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
)
scheduler = PNDMScheduler(skip_prk_steps=True)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
)
torch.manual_seed(0)
config = CLIPVisionConfig(
hidden_size=32,
projection_dim=32,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
image_size=32,
patch_size=4,
)
image_encoder = PaintByExampleImageEncoder(config, proj_size=32)
feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"image_encoder": image_encoder,
"safety_checker": None,
"feature_extractor": feature_extractor,
}
return components
def convert_to_pt(self, image):
image = np.array(image.convert("RGB"))
image = image[None].transpose(0, 3, 1, 2)
image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
return image
def get_dummy_inputs(self, device="cpu", seed=0):
# TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
image = image.cpu().permute(0, 2, 3, 1)[0]
init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"example_image": example_image,
"image": init_image,
"mask_image": mask_image,
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
"output_type": "np",
}
return inputs
def test_paint_by_example_inpaint(self):
components = self.get_dummy_components()
# make sure here that pndm scheduler skips prk
pipe = PaintByExamplePipeline(**components)
pipe = pipe.to("cpu")
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs()
output = pipe(**inputs)
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.4686, 0.5687, 0.4007, 0.5218, 0.5741, 0.4482, 0.4940, 0.4629, 0.4503])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_paint_by_example_image_tensor(self):
device = "cpu"
inputs = self.get_dummy_inputs()
inputs.pop("mask_image")
image = self.convert_to_pt(inputs.pop("image"))
mask_image = image.clamp(0, 1) / 2
# make sure here that pndm scheduler skips prk
pipe = PaintByExamplePipeline(**self.get_dummy_components())
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
output = pipe(image=image, mask_image=mask_image[:, 0], **inputs)
out_1 = output.images
image = image.cpu().permute(0, 2, 3, 1)[0]
mask_image = mask_image.cpu().permute(0, 2, 3, 1)[0]
image = Image.fromarray(np.uint8(image)).convert("RGB")
mask_image = Image.fromarray(np.uint8(mask_image)).convert("RGB")
output = pipe(**self.get_dummy_inputs())
out_2 = output.images
assert out_1.shape == (1, 64, 64, 3)
assert np.abs(out_1.flatten() - out_2.flatten()).max() < 5e-2
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=3e-3)
@nightly
@require_torch_accelerator
class PaintByExamplePipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_paint_by_example(self):
# make sure here that pndm scheduler skips prk
init_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
"/paint_by_example/dog_in_bucket.png"
)
mask_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
"/paint_by_example/mask.png"
)
example_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
"/paint_by_example/panda.jpg"
)
pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example")
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
generator = torch.manual_seed(321)
output = pipe(
image=init_image,
mask_image=mask_image,
example_image=example_image,
generator=generator,
guidance_scale=5.0,
num_inference_steps=50,
output_type="np",
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.5290, 0.5374])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

View File

@@ -1,448 +0,0 @@
import random
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
import diffusers
from diffusers import (
AutoencoderKL,
DDIMScheduler,
DPMSolverMultistepScheduler,
LCMScheduler,
MotionAdapter,
PIAPipeline,
StableDiffusionPipeline,
UNet2DConditionModel,
UNetMotionModel,
)
from diffusers.utils import is_xformers_available, logging
from diffusers.utils.testing_utils import floats_tensor, require_accelerator, torch_device
from ..test_pipelines_common import IPAdapterTesterMixin, PipelineFromPipeTesterMixin, PipelineTesterMixin
def to_np(tensor):
if isinstance(tensor, torch.Tensor):
tensor = tensor.detach().cpu().numpy()
return tensor
class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase):
pipeline_class = PIAPipeline
params = frozenset(
[
"prompt",
"height",
"width",
"guidance_scale",
"negative_prompt",
"prompt_embeds",
"negative_prompt_embeds",
"cross_attention_kwargs",
]
)
batch_params = frozenset(["prompt", "image", "generator"])
required_optional_params = frozenset(
[
"num_inference_steps",
"generator",
"latents",
"return_dict",
"callback_on_step_end",
"callback_on_step_end_tensor_inputs",
]
)
test_layerwise_casting = True
test_group_offloading = True
def get_dummy_components(self):
cross_attention_dim = 8
block_out_channels = (8, 8)
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=block_out_channels,
layers_per_block=2,
sample_size=8,
in_channels=4,
out_channels=4,
down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=cross_attention_dim,
norm_num_groups=2,
)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="linear",
clip_sample=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=block_out_channels,
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
norm_num_groups=2,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=cross_attention_dim,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
torch.manual_seed(0)
motion_adapter = MotionAdapter(
block_out_channels=block_out_channels,
motion_layers_per_block=2,
motion_norm_num_groups=2,
motion_num_attention_heads=4,
conv_in_channels=9,
)
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"motion_adapter": motion_adapter,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"feature_extractor": None,
"image_encoder": None,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
image = floats_tensor((1, 3, 8, 8), rng=random.Random(seed)).to(device)
inputs = {
"image": image,
"prompt": "A painting of a squirrel eating a burger",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 7.5,
"output_type": "pt",
}
return inputs
def test_from_pipe_consistent_config(self):
assert self.original_pipeline_class == StableDiffusionPipeline
original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
original_kwargs = {"requires_safety_checker": False}
# create original_pipeline_class(sd)
pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
# original_pipeline_class(sd) -> pipeline_class
pipe_components = self.get_dummy_components()
pipe_additional_components = {}
for name, component in pipe_components.items():
if name not in pipe_original.components:
pipe_additional_components[name] = component
pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
# pipeline_class -> original_pipeline_class(sd)
original_pipe_additional_components = {}
for name, component in pipe_original.components.items():
if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
original_pipe_additional_components[name] = component
pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
# compare the config
original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
assert original_config_2 == original_config
def test_motion_unet_loading(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
assert isinstance(pipe.unet, UNetMotionModel)
def test_ip_adapter(self):
expected_pipe_slice = None
if torch_device == "cpu":
expected_pipe_slice = np.array(
[
0.5475,
0.5769,
0.4873,
0.5064,
0.4445,
0.5876,
0.5453,
0.4102,
0.5247,
0.5370,
0.3406,
0.4322,
0.3991,
0.3756,
0.5438,
0.4780,
0.5087,
0.5248,
0.6243,
0.5506,
0.3491,
0.5440,
0.6111,
0.5122,
0.5326,
0.5180,
0.5538,
]
)
return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)
def test_dict_tuple_outputs_equivalent(self):
expected_slice = None
if torch_device == "cpu":
expected_slice = np.array([0.5476, 0.4092, 0.5289, 0.4755, 0.5092, 0.5186, 0.5403, 0.5287, 0.5467])
return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
@unittest.skip("Attention slicing is not enabled in this pipeline")
def test_attention_slicing_forward_pass(self):
pass
def test_inference_batch_single_identical(
self,
batch_size=2,
expected_max_diff=1e-4,
additional_params_copy_to_batched_inputs=["num_inference_steps"],
):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for components in pipe.components.values():
if hasattr(components, "set_default_attn_processor"):
components.set_default_attn_processor()
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(torch_device)
# Reset generator in case it is has been used in self.get_dummy_inputs
inputs["generator"] = self.get_generator(0)
logger = logging.get_logger(pipe.__module__)
logger.setLevel(level=diffusers.logging.FATAL)
# batchify inputs
batched_inputs = {}
batched_inputs.update(inputs)
for name in self.batch_params:
if name not in inputs:
continue
value = inputs[name]
if name == "prompt":
len_prompt = len(value)
batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
batched_inputs[name][-1] = 100 * "very long"
else:
batched_inputs[name] = batch_size * [value]
if "generator" in inputs:
batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
if "batch_size" in inputs:
batched_inputs["batch_size"] = batch_size
for arg in additional_params_copy_to_batched_inputs:
batched_inputs[arg] = inputs[arg]
output = pipe(**inputs)
output_batch = pipe(**batched_inputs)
assert output_batch[0].shape[0] == batch_size
max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
assert max_diff < expected_max_diff
@require_accelerator
def test_to_device(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
pipe.to("cpu")
# pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
model_devices = [
component.device.type for component in pipe.components.values() if hasattr(component, "device")
]
self.assertTrue(all(device == "cpu" for device in model_devices))
output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
self.assertTrue(np.isnan(output_cpu).sum() == 0)
pipe.to(torch_device)
model_devices = [
component.device.type for component in pipe.components.values() if hasattr(component, "device")
]
self.assertTrue(all(device == torch_device for device in model_devices))
output_device = pipe(**self.get_dummy_inputs(torch_device))[0]
self.assertTrue(np.isnan(to_np(output_device)).sum() == 0)
def test_to_dtype(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
# pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
pipe.to(dtype=torch.float16)
model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
def test_prompt_embeds(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
pipe.to(torch_device)
inputs = self.get_dummy_inputs(torch_device)
inputs.pop("prompt")
inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
pipe(**inputs)
def test_free_init(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
pipe.to(torch_device)
inputs_normal = self.get_dummy_inputs(torch_device)
frames_normal = pipe(**inputs_normal).frames[0]
pipe.enable_free_init(
num_iters=2,
use_fast_sampling=True,
method="butterworth",
order=4,
spatial_stop_frequency=0.25,
temporal_stop_frequency=0.25,
)
inputs_enable_free_init = self.get_dummy_inputs(torch_device)
frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
pipe.disable_free_init()
inputs_disable_free_init = self.get_dummy_inputs(torch_device)
frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0]
sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max()
self.assertGreater(
sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results"
)
self.assertLess(
max_diff_disabled,
1e-4,
"Disabling of FreeInit should lead to results similar to the default pipeline results",
)
def test_free_init_with_schedulers(self):
components = self.get_dummy_components()
pipe: PIAPipeline = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
pipe.to(torch_device)
inputs_normal = self.get_dummy_inputs(torch_device)
frames_normal = pipe(**inputs_normal).frames[0]
schedulers_to_test = [
DPMSolverMultistepScheduler.from_config(
components["scheduler"].config,
timestep_spacing="linspace",
beta_schedule="linear",
algorithm_type="dpmsolver++",
steps_offset=1,
clip_sample=False,
),
LCMScheduler.from_config(
components["scheduler"].config,
timestep_spacing="linspace",
beta_schedule="linear",
steps_offset=1,
clip_sample=False,
),
]
components.pop("scheduler")
for scheduler in schedulers_to_test:
components["scheduler"] = scheduler
pipe: PIAPipeline = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
pipe.to(torch_device)
pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
inputs = self.get_dummy_inputs(torch_device)
frames_enable_free_init = pipe(**inputs).frames[0]
sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
self.assertGreater(
sum_enabled,
1e1,
"Enabling of FreeInit should lead to results different from the default pipeline results",
)
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
)
def test_xformers_attention_forwardGenerator_pass(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
for component in pipe.components.values():
if hasattr(component, "set_default_attn_processor"):
component.set_default_attn_processor()
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(torch_device)
output_without_offload = pipe(**inputs).frames[0]
output_without_offload = (
output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
)
pipe.enable_xformers_memory_efficient_attention()
inputs = self.get_dummy_inputs(torch_device)
output_with_offload = pipe(**inputs).frames[0]
output_with_offload = (
output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload
)
max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")
def test_encode_prompt_works_in_isolation(self):
extra_required_param_value_dict = {
"device": torch.device(torch_device).type,
"num_images_per_prompt": 1,
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
}
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)

View File

@@ -1,617 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import random
import tempfile
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
from diffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipeline as StableDiffusionPipeline
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
nightly,
require_torch_accelerator,
torch_device,
)
enable_full_determinism()
class SafeDiffusionPipelineFastTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
@property
def dummy_image(self):
batch_size = 1
num_channels = 3
sizes = (32, 32)
image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
return image
@property
def dummy_cond_unet(self):
torch.manual_seed(0)
model = UNet2DConditionModel(
block_out_channels=(32, 64),
layers_per_block=2,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
)
return model
@property
def dummy_vae(self):
torch.manual_seed(0)
model = AutoencoderKL(
block_out_channels=[32, 64],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
)
return model
@property
def dummy_text_encoder(self):
torch.manual_seed(0)
config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
return CLIPTextModel(config)
@property
def dummy_extractor(self):
def extract(*args, **kwargs):
class Out:
def __init__(self):
self.pixel_values = torch.ones([0])
def to(self, device):
self.pixel_values.to(device)
return self
return Out()
return extract
def test_semantic_diffusion_ddim(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
unet = self.dummy_cond_unet
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
vae = self.dummy_vae
bert = self.dummy_text_encoder
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
# make sure here that pndm scheduler skips prk
sd_pipe = StableDiffusionPipeline(
unet=unet,
scheduler=scheduler,
vae=vae,
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
feature_extractor=self.dummy_extractor,
)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = torch.Generator(device=device).manual_seed(0)
output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
image = output.images
generator = torch.Generator(device=device).manual_seed(0)
image_from_tuple = sd_pipe(
[prompt],
generator=generator,
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
return_dict=False,
)[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.5753, 0.6114, 0.5001, 0.5034, 0.5470, 0.4729, 0.4971, 0.4867, 0.4867])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
def test_semantic_diffusion_pndm(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
unet = self.dummy_cond_unet
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
# make sure here that pndm scheduler skips prk
sd_pipe = StableDiffusionPipeline(
unet=unet,
scheduler=scheduler,
vae=vae,
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
feature_extractor=self.dummy_extractor,
)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = torch.Generator(device=device).manual_seed(0)
output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
image = output.images
generator = torch.Generator(device=device).manual_seed(0)
image_from_tuple = sd_pipe(
[prompt],
generator=generator,
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
return_dict=False,
)[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.5122, 0.5712, 0.4825, 0.5053, 0.5646, 0.4769, 0.5179, 0.4894, 0.4994])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
def test_semantic_diffusion_no_safety_checker(self):
pipe = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
)
assert isinstance(pipe, StableDiffusionPipeline)
assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
assert pipe.safety_checker is None
image = pipe("example prompt", num_inference_steps=2).images[0]
assert image is not None
# check that there's no error when saving a pipeline with one of the models being None
with tempfile.TemporaryDirectory() as tmpdirname:
pipe.save_pretrained(tmpdirname)
pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)
# sanity check that the pipeline still works
assert pipe.safety_checker is None
image = pipe("example prompt", num_inference_steps=2).images[0]
assert image is not None
@require_torch_accelerator
def test_semantic_diffusion_fp16(self):
"""Test that stable diffusion works with fp16"""
unet = self.dummy_cond_unet
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
# put models in fp16
unet = unet.half()
vae = vae.half()
bert = bert.half()
# make sure here that pndm scheduler skips prk
sd_pipe = StableDiffusionPipeline(
unet=unet,
scheduler=scheduler,
vae=vae,
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
feature_extractor=self.dummy_extractor,
)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images
assert image.shape == (1, 64, 64, 3)
@nightly
@require_torch_accelerator
class SemanticDiffusionPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_positive_guidance(self):
pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
prompt = "a photo of a cat"
edit = {
"editing_prompt": ["sunglasses"],
"reverse_editing_direction": [False],
"edit_warmup_steps": 10,
"edit_guidance_scale": 6,
"edit_threshold": 0.95,
"edit_momentum_scale": 0.5,
"edit_mom_beta": 0.6,
}
seed = 3
guidance_scale = 7
# no sega enabled
generator = torch.Generator(torch_device)
generator.manual_seed(seed)
output = pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
0.34673113,
0.38492733,
0.37597352,
0.34086335,
0.35650748,
0.35579205,
0.3384763,
0.34340236,
0.3573271,
]
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
# with sega enabled
# generator = torch.manual_seed(seed)
generator.manual_seed(seed)
output = pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
**edit,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
0.41887826,
0.37728766,
0.30138272,
0.41416335,
0.41664985,
0.36283392,
0.36191246,
0.43364465,
0.43001732,
]
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_negative_guidance(self):
pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
prompt = "an image of a crowded boulevard, realistic, 4k"
edit = {
"editing_prompt": "crowd, crowded, people",
"reverse_editing_direction": True,
"edit_warmup_steps": 10,
"edit_guidance_scale": 8.3,
"edit_threshold": 0.9,
"edit_momentum_scale": 0.5,
"edit_mom_beta": 0.6,
}
seed = 9
guidance_scale = 7
# no sega enabled
generator = torch.Generator(torch_device)
generator.manual_seed(seed)
output = pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
0.43497998,
0.91814065,
0.7540739,
0.55580205,
0.8467265,
0.5389691,
0.62574506,
0.58897763,
0.50926757,
]
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
# with sega enabled
# generator = torch.manual_seed(seed)
generator.manual_seed(seed)
output = pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
**edit,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
0.3089719,
0.30500144,
0.29016042,
0.30630964,
0.325687,
0.29419225,
0.2908091,
0.28723598,
0.27696294,
]
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_multi_cond_guidance(self):
pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
prompt = "a castle next to a river"
edit = {
"editing_prompt": ["boat on a river, boat", "monet, impression, sunrise"],
"reverse_editing_direction": False,
"edit_warmup_steps": [15, 18],
"edit_guidance_scale": 6,
"edit_threshold": [0.9, 0.8],
"edit_momentum_scale": 0.5,
"edit_mom_beta": 0.6,
}
seed = 48
guidance_scale = 7
# no sega enabled
generator = torch.Generator(torch_device)
generator.manual_seed(seed)
output = pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
0.75163555,
0.76037145,
0.61785,
0.9189673,
0.8627701,
0.85189694,
0.8512813,
0.87012076,
0.8312857,
]
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
# with sega enabled
# generator = torch.manual_seed(seed)
generator.manual_seed(seed)
output = pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
**edit,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
0.73553365,
0.7537271,
0.74341905,
0.66480356,
0.6472925,
0.63039416,
0.64812905,
0.6749717,
0.6517102,
]
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_guidance_fp16(self):
pipe = StableDiffusionPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
prompt = "a photo of a cat"
edit = {
"editing_prompt": ["sunglasses"],
"reverse_editing_direction": [False],
"edit_warmup_steps": 10,
"edit_guidance_scale": 6,
"edit_threshold": 0.95,
"edit_momentum_scale": 0.5,
"edit_mom_beta": 0.6,
}
seed = 3
guidance_scale = 7
# no sega enabled
generator = torch.Generator(torch_device)
generator.manual_seed(seed)
output = pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
0.34887695,
0.3876953,
0.375,
0.34423828,
0.3581543,
0.35717773,
0.3383789,
0.34570312,
0.359375,
]
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
# with sega enabled
# generator = torch.manual_seed(seed)
generator.manual_seed(seed)
output = pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
**edit,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [
0.42285156,
0.36914062,
0.29077148,
0.42041016,
0.41918945,
0.35498047,
0.3618164,
0.4423828,
0.43115234,
]
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

View File

@@ -1,267 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
AutoencoderKL,
DDIMScheduler,
StableDiffusionAttendAndExcitePipeline,
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
load_numpy,
nightly,
numpy_cosine_similarity_distance,
require_torch_accelerator,
skip_mps,
torch_device,
)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import (
PipelineFromPipeTesterMixin,
PipelineKarrasSchedulerTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
)
torch.backends.cuda.matmul.allow_tf32 = False
@skip_mps
class StableDiffusionAttendAndExcitePipelineFastTests(
PipelineLatentTesterMixin,
PipelineKarrasSchedulerTesterMixin,
PipelineTesterMixin,
PipelineFromPipeTesterMixin,
unittest.TestCase,
):
pipeline_class = StableDiffusionAttendAndExcitePipeline
test_attention_slicing = False
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"token_indices"})
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
# Attend and excite requires being able to run a backward pass at
# inference time. There's no deterministic backward operator for pad
@classmethod
def setUpClass(cls):
super().setUpClass()
torch.use_deterministic_algorithms(False)
@classmethod
def tearDownClass(cls):
super().tearDownClass()
torch.use_deterministic_algorithms(True)
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(32, 64),
layers_per_block=1,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
# SD2-specific config below
attention_head_dim=(2, 4),
use_linear_projection=True,
)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
sample_size=128,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
# SD2-specific config below
hidden_act="gelu",
projection_dim=512,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "a cat and a frog",
"token_indices": [2, 5],
"generator": generator,
"num_inference_steps": 1,
"guidance_scale": 6.0,
"output_type": "np",
"max_iter_to_alter": 2,
"thresholds": {0: 0.7},
}
return inputs
def test_dict_tuple_outputs_equivalent(self):
expected_slice = None
if torch_device == "cpu":
expected_slice = np.array([0.6391, 0.6290, 0.4860, 0.5134, 0.5550, 0.4577, 0.5033, 0.5023, 0.4538])
super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice, expected_max_difference=3e-3)
def test_inference(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
self.assertEqual(image.shape, (1, 64, 64, 3))
expected_slice = np.array(
[0.63905364, 0.62897307, 0.48599017, 0.5133624, 0.5550048, 0.45769516, 0.50326973, 0.5023139, 0.45384496]
)
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 1e-3)
def test_sequential_cpu_offload_forward_pass(self):
super().test_sequential_cpu_offload_forward_pass(expected_max_diff=5e-4)
def test_inference_batch_consistent(self):
# NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches
self._test_inference_batch_consistent(batch_sizes=[1, 2])
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=7e-4)
def test_pt_np_pil_outputs_equivalent(self):
super().test_pt_np_pil_outputs_equivalent(expected_max_diff=5e-4)
def test_save_load_local(self):
super().test_save_load_local(expected_max_difference=5e-4)
def test_save_load_optional_components(self):
super().test_save_load_optional_components(expected_max_difference=4e-4)
def test_karras_schedulers_shape(self):
super().test_karras_schedulers_shape(num_inference_steps_for_strength_for_iterations=3)
def test_from_pipe_consistent_forward_pass_cpu_offload(self):
super().test_from_pipe_consistent_forward_pass_cpu_offload(expected_max_diff=5e-3)
def test_encode_prompt_works_in_isolation(self):
extra_required_param_value_dict = {
"device": torch.device(torch_device).type,
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
}
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
@require_torch_accelerator
@nightly
class StableDiffusionAttendAndExcitePipelineIntegrationTests(unittest.TestCase):
# Attend and excite requires being able to run a backward pass at
# inference time. There's no deterministic backward operator for pad
@classmethod
def setUpClass(cls):
super().setUpClass()
torch.use_deterministic_algorithms(False)
@classmethod
def tearDownClass(cls):
super().tearDownClass()
torch.use_deterministic_algorithms(True)
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_attend_and_excite_fp16(self):
generator = torch.manual_seed(51)
pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
)
pipe.to(torch_device)
prompt = "a painting of an elephant with glasses"
token_indices = [5, 7]
image = pipe(
prompt=prompt,
token_indices=token_indices,
guidance_scale=7.5,
generator=generator,
num_inference_steps=5,
max_iter_to_alter=5,
output_type="np",
).images[0]
expected_image = load_numpy(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/attend-and-excite/elephant_glasses.npy"
)
max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
assert max_diff < 5e-1

View File

@@ -1,452 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import random
import tempfile
import unittest
import numpy as np
import torch
from PIL import Image
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
AutoencoderKL,
DDIMInverseScheduler,
DDIMScheduler,
DPMSolverMultistepInverseScheduler,
DPMSolverMultistepScheduler,
StableDiffusionDiffEditPipeline,
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
nightly,
numpy_cosine_similarity_distance,
require_torch_accelerator,
torch_device,
)
from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
from ..test_pipelines_common import PipelineFromPipeTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
enable_full_determinism()
class StableDiffusionDiffEditPipelineFastTests(
PipelineLatentTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
):
pipeline_class = StableDiffusionDiffEditPipeline
params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"height", "width", "image"} | {"image_latents"}
batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - {"image"} | {"image_latents"}
image_params = frozenset(
[]
) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
image_latents_params = frozenset([])
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(32, 64),
layers_per_block=2,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
# SD2-specific config below
attention_head_dim=(2, 4),
use_linear_projection=True,
)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
inverse_scheduler = DDIMInverseScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_zero=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
sample_size=128,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
# SD2-specific config below
hidden_act="gelu",
projection_dim=512,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
"inverse_scheduler": inverse_scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
}
return components
def get_dummy_inputs(self, device, seed=0):
mask = floats_tensor((1, 16, 16), rng=random.Random(seed)).to(device)
latents = floats_tensor((1, 2, 4, 16, 16), rng=random.Random(seed)).to(device)
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "a dog and a newt",
"mask_image": mask,
"image_latents": latents,
"generator": generator,
"num_inference_steps": 2,
"inpaint_strength": 1.0,
"guidance_scale": 6.0,
"output_type": "np",
}
return inputs
def get_dummy_mask_inputs(self, device, seed=0):
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
image = image.cpu().permute(0, 2, 3, 1)[0]
image = Image.fromarray(np.uint8(image)).convert("RGB")
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"image": image,
"source_prompt": "a cat and a frog",
"target_prompt": "a dog and a newt",
"generator": generator,
"num_inference_steps": 2,
"num_maps_per_mask": 2,
"mask_encode_strength": 1.0,
"guidance_scale": 6.0,
"output_type": "np",
}
return inputs
def get_dummy_inversion_inputs(self, device, seed=0):
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
image = image.cpu().permute(0, 2, 3, 1)[0]
image = Image.fromarray(np.uint8(image)).convert("RGB")
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"image": image,
"prompt": "a cat and a frog",
"generator": generator,
"num_inference_steps": 2,
"inpaint_strength": 1.0,
"guidance_scale": 6.0,
"decode_latents": True,
"output_type": "np",
}
return inputs
def test_save_load_optional_components(self):
if not hasattr(self.pipeline_class, "_optional_components"):
return
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
# set all optional components to None and update pipeline config accordingly
for optional_component in pipe._optional_components:
setattr(pipe, optional_component, None)
pipe.register_modules(**dict.fromkeys(pipe._optional_components))
inputs = self.get_dummy_inputs(torch_device)
output = pipe(**inputs)[0]
with tempfile.TemporaryDirectory() as tmpdir:
pipe.save_pretrained(tmpdir)
pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
pipe_loaded.to(torch_device)
pipe_loaded.set_progress_bar_config(disable=None)
for optional_component in pipe._optional_components:
self.assertTrue(
getattr(pipe_loaded, optional_component) is None,
f"`{optional_component}` did not stay set to None after loading.",
)
inputs = self.get_dummy_inputs(torch_device)
output_loaded = pipe_loaded(**inputs)[0]
max_diff = np.abs(output - output_loaded).max()
self.assertLess(max_diff, 1e-4)
def test_mask(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_mask_inputs(device)
mask = pipe.generate_mask(**inputs)
mask_slice = mask[0, -3:, -3:]
self.assertEqual(mask.shape, (1, 16, 16))
expected_slice = np.array([0] * 9)
max_diff = np.abs(mask_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 1e-3)
self.assertEqual(mask[0, -3, -4], 0)
def test_inversion(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inversion_inputs(device)
image = pipe.invert(**inputs).images
image_slice = image[0, -1, -3:, -3:]
self.assertEqual(image.shape, (2, 32, 32, 3))
expected_slice = np.array(
[0.5160, 0.5115, 0.5060, 0.5456, 0.4704, 0.5060, 0.5019, 0.4405, 0.4726],
)
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 1e-3)
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=5e-3)
def test_inversion_dpm(self):
device = "cpu"
components = self.get_dummy_components()
scheduler_args = {"beta_start": 0.00085, "beta_end": 0.012, "beta_schedule": "scaled_linear"}
components["scheduler"] = DPMSolverMultistepScheduler(**scheduler_args)
components["inverse_scheduler"] = DPMSolverMultistepInverseScheduler(**scheduler_args)
pipe = self.pipeline_class(**components)
pipe.to(device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inversion_inputs(device)
image = pipe.invert(**inputs).images
image_slice = image[0, -1, -3:, -3:]
self.assertEqual(image.shape, (2, 32, 32, 3))
expected_slice = np.array(
[0.5305, 0.4673, 0.5314, 0.5308, 0.4886, 0.5279, 0.5142, 0.4724, 0.4892],
)
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
self.assertLessEqual(max_diff, 1e-3)
def test_encode_prompt_works_in_isolation(self):
extra_required_param_value_dict = {
"device": torch.device(torch_device).type,
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
}
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
@require_torch_accelerator
@nightly
class StableDiffusionDiffEditPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
@classmethod
def setUpClass(cls):
raw_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/diffedit/fruit.png"
)
raw_image = raw_image.convert("RGB").resize((256, 256))
cls.raw_image = raw_image
def test_stable_diffusion_diffedit_full(self):
generator = torch.manual_seed(0)
pipe = StableDiffusionDiffEditPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-1-base", safety_checker=None, torch_dtype=torch.float16
)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.scheduler.clip_sample = True
pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload(device=torch_device)
pipe.set_progress_bar_config(disable=None)
source_prompt = "a bowl of fruit"
target_prompt = "a bowl of pears"
mask_image = pipe.generate_mask(
image=self.raw_image,
source_prompt=source_prompt,
target_prompt=target_prompt,
generator=generator,
)
inv_latents = pipe.invert(
prompt=source_prompt,
image=self.raw_image,
inpaint_strength=0.7,
generator=generator,
num_inference_steps=5,
).latents
image = pipe(
prompt=target_prompt,
mask_image=mask_image,
image_latents=inv_latents,
generator=generator,
negative_prompt=source_prompt,
inpaint_strength=0.7,
num_inference_steps=5,
output_type="np",
).images[0]
expected_image = (
np.array(
load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
"/diffedit/pears.png"
).resize((256, 256))
)
/ 255
)
assert numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten()) < 2e-1
@nightly
@require_torch_accelerator
class StableDiffusionDiffEditPipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
@classmethod
def setUpClass(cls):
raw_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/diffedit/fruit.png"
)
raw_image = raw_image.convert("RGB").resize((768, 768))
cls.raw_image = raw_image
def test_stable_diffusion_diffedit_dpm(self):
generator = torch.manual_seed(0)
pipe = StableDiffusionDiffEditPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16
)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.inverse_scheduler = DPMSolverMultistepInverseScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()
pipe.set_progress_bar_config(disable=None)
source_prompt = "a bowl of fruit"
target_prompt = "a bowl of pears"
mask_image = pipe.generate_mask(
image=self.raw_image,
source_prompt=source_prompt,
target_prompt=target_prompt,
generator=generator,
)
inv_latents = pipe.invert(
prompt=source_prompt,
image=self.raw_image,
inpaint_strength=0.7,
generator=generator,
num_inference_steps=25,
).latents
image = pipe(
prompt=target_prompt,
mask_image=mask_image,
image_latents=inv_latents,
generator=generator,
negative_prompt=source_prompt,
inpaint_strength=0.7,
num_inference_steps=25,
output_type="np",
).images[0]
expected_image = (
np.array(
load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
"/diffedit/pears.png"
).resize((768, 768))
)
/ 255
)
assert np.abs((expected_image - image).max()) < 5e-1

View File

@@ -1,175 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
AutoencoderKL,
DDIMScheduler,
EulerAncestralDiscreteScheduler,
StableDiffusionGLIGENPipeline,
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import enable_full_determinism
from ..pipeline_params import (
TEXT_TO_IMAGE_BATCH_PARAMS,
TEXT_TO_IMAGE_IMAGE_PARAMS,
TEXT_TO_IMAGE_PARAMS,
)
from ..test_pipelines_common import (
PipelineFromPipeTesterMixin,
PipelineKarrasSchedulerTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
)
enable_full_determinism()
class GligenPipelineFastTests(
PipelineLatentTesterMixin,
PipelineKarrasSchedulerTesterMixin,
PipelineTesterMixin,
PipelineFromPipeTesterMixin,
unittest.TestCase,
):
pipeline_class = StableDiffusionGLIGENPipeline
params = TEXT_TO_IMAGE_PARAMS | {"gligen_phrases", "gligen_boxes"}
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(32, 64),
layers_per_block=2,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
attention_type="gated",
)
# unet.position_net = PositionNet(32,32)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
sample_size=128,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "A modern livingroom",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
"gligen_phrases": ["a birthday cake"],
"gligen_boxes": [[0.2676, 0.6088, 0.4773, 0.7183]],
"output_type": "np",
}
return inputs
def test_stable_diffusion_gligen_default_case(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = StableDiffusionGLIGENPipeline(**components)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.5069, 0.5561, 0.4577, 0.4792, 0.5203, 0.4089, 0.5039, 0.4919, 0.4499])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_stable_diffusion_gligen_k_euler_ancestral(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = StableDiffusionGLIGENPipeline(**components)
sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
output = sd_pipe(**inputs)
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.425, 0.494, 0.429, 0.469, 0.525, 0.417, 0.533, 0.5, 0.47])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_attention_slicing_forward_pass(self):
super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(batch_size=3, expected_max_diff=3e-3)
@unittest.skip("Test not supported as tokenizer is used for parsing bounding boxes.")
def test_encode_prompt_works_in_isolation(self):
pass

View File

@@ -1,215 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import torch
from transformers import (
CLIPProcessor,
CLIPTextConfig,
CLIPTextModel,
CLIPTokenizer,
CLIPVisionConfig,
CLIPVisionModelWithProjection,
)
from diffusers import (
AutoencoderKL,
DDIMScheduler,
EulerAncestralDiscreteScheduler,
StableDiffusionGLIGENTextImagePipeline,
UNet2DConditionModel,
)
from diffusers.pipelines.stable_diffusion import CLIPImageProjection
from diffusers.utils import load_image
from diffusers.utils.testing_utils import enable_full_determinism, torch_device
from ..pipeline_params import (
TEXT_TO_IMAGE_BATCH_PARAMS,
TEXT_TO_IMAGE_IMAGE_PARAMS,
TEXT_TO_IMAGE_PARAMS,
)
from ..test_pipelines_common import (
PipelineFromPipeTesterMixin,
PipelineKarrasSchedulerTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
)
enable_full_determinism()
class GligenTextImagePipelineFastTests(
PipelineLatentTesterMixin,
PipelineKarrasSchedulerTesterMixin,
PipelineTesterMixin,
PipelineFromPipeTesterMixin,
unittest.TestCase,
):
pipeline_class = StableDiffusionGLIGENTextImagePipeline
params = TEXT_TO_IMAGE_PARAMS | {"gligen_phrases", "gligen_images", "gligen_boxes"}
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
supports_dduf = False
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(32, 64),
layers_per_block=2,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
attention_type="gated-text-image",
)
# unet.position_net = PositionNet(32,32)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
sample_size=128,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
image_encoder_config = CLIPVisionConfig(
hidden_size=32,
projection_dim=32,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
)
image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
image_project = CLIPImageProjection(hidden_size=32)
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"image_encoder": image_encoder,
"image_project": image_project,
"processor": processor,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
gligen_images = load_image(
"https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/livingroom_modern.png"
)
inputs = {
"prompt": "A modern livingroom",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
"gligen_phrases": ["a birthday cake"],
"gligen_images": [gligen_images],
"gligen_boxes": [[0.2676, 0.6088, 0.4773, 0.7183]],
"output_type": "np",
}
return inputs
def test_dict_tuple_outputs_equivalent(self):
expected_slice = None
if torch_device == "cpu":
expected_slice = np.array([0.5052, 0.5546, 0.4567, 0.4770, 0.5195, 0.4085, 0.5026, 0.4909, 0.4495])
super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
def test_stable_diffusion_gligen_text_image_default_case(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = StableDiffusionGLIGENTextImagePipeline(**components)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.5069, 0.5561, 0.4577, 0.4792, 0.5203, 0.4089, 0.5039, 0.4919, 0.4499])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_stable_diffusion_gligen_k_euler_ancestral(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = StableDiffusionGLIGENTextImagePipeline(**components)
sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.425, 0.494, 0.429, 0.469, 0.525, 0.417, 0.533, 0.5, 0.47])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_attention_slicing_forward_pass(self):
super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(batch_size=3, expected_max_diff=3e-3)
@unittest.skip(
"Test not supported because of the use of `text_encoder` in `get_cross_attention_kwargs_with_grounded()`."
)
def test_encode_prompt_works_in_isolation(self):
pass

View File

@@ -1,326 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
AutoencoderKL,
DDIMScheduler,
PNDMScheduler,
StableDiffusionLDM3DPipeline,
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
nightly,
require_torch_accelerator,
torch_device,
)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
enable_full_determinism()
class StableDiffusionLDM3DPipelineFastTests(unittest.TestCase):
pipeline_class = StableDiffusionLDM3DPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(32, 64),
layers_per_block=2,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
in_channels=6,
out_channels=6,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"image_encoder": None,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "A painting of a squirrel eating a burger",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
"output_type": "np",
}
return inputs
def test_stable_diffusion_ddim(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
ldm3d_pipe = StableDiffusionLDM3DPipeline(**components)
ldm3d_pipe = ldm3d_pipe.to(torch_device)
ldm3d_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
output = ldm3d_pipe(**inputs)
rgb, depth = output.rgb, output.depth
image_slice_rgb = rgb[0, -3:, -3:, -1]
image_slice_depth = depth[0, -3:, -1]
assert rgb.shape == (1, 64, 64, 3)
assert depth.shape == (1, 64, 64)
expected_slice_rgb = np.array(
[0.37338176, 0.70247, 0.74203193, 0.51643604, 0.58256793, 0.60932136, 0.4181095, 0.48355877, 0.46535262]
)
expected_slice_depth = np.array([103.46727, 85.812004, 87.849236])
assert np.abs(image_slice_rgb.flatten() - expected_slice_rgb).max() < 1e-2
assert np.abs(image_slice_depth.flatten() - expected_slice_depth).max() < 1e-2
def test_stable_diffusion_prompt_embeds(self):
components = self.get_dummy_components()
ldm3d_pipe = StableDiffusionLDM3DPipeline(**components)
ldm3d_pipe = ldm3d_pipe.to(torch_device)
ldm3d_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(torch_device)
inputs["prompt"] = 3 * [inputs["prompt"]]
# forward
output = ldm3d_pipe(**inputs)
rgb_slice_1, depth_slice_1 = output.rgb, output.depth
rgb_slice_1 = rgb_slice_1[0, -3:, -3:, -1]
depth_slice_1 = depth_slice_1[0, -3:, -1]
inputs = self.get_dummy_inputs(torch_device)
prompt = 3 * [inputs.pop("prompt")]
text_inputs = ldm3d_pipe.tokenizer(
prompt,
padding="max_length",
max_length=ldm3d_pipe.tokenizer.model_max_length,
truncation=True,
return_tensors="pt",
)
text_inputs = text_inputs["input_ids"].to(torch_device)
prompt_embeds = ldm3d_pipe.text_encoder(text_inputs)[0]
inputs["prompt_embeds"] = prompt_embeds
# forward
output = ldm3d_pipe(**inputs)
rgb_slice_2, depth_slice_2 = output.rgb, output.depth
rgb_slice_2 = rgb_slice_2[0, -3:, -3:, -1]
depth_slice_2 = depth_slice_2[0, -3:, -1]
assert np.abs(rgb_slice_1.flatten() - rgb_slice_2.flatten()).max() < 1e-4
assert np.abs(depth_slice_1.flatten() - depth_slice_2.flatten()).max() < 1e-4
def test_stable_diffusion_negative_prompt(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
ldm3d_pipe = StableDiffusionLDM3DPipeline(**components)
ldm3d_pipe = ldm3d_pipe.to(device)
ldm3d_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
negative_prompt = "french fries"
output = ldm3d_pipe(**inputs, negative_prompt=negative_prompt)
rgb, depth = output.rgb, output.depth
rgb_slice = rgb[0, -3:, -3:, -1]
depth_slice = depth[0, -3:, -1]
assert rgb.shape == (1, 64, 64, 3)
assert depth.shape == (1, 64, 64)
expected_slice_rgb = np.array(
[0.37044, 0.71811503, 0.7223251, 0.48603675, 0.5638391, 0.6364948, 0.42833704, 0.4901315, 0.47926217]
)
expected_slice_depth = np.array([107.84738, 84.62802, 89.962135])
assert np.abs(rgb_slice.flatten() - expected_slice_rgb).max() < 1e-2
assert np.abs(depth_slice.flatten() - expected_slice_depth).max() < 1e-2
@nightly
@require_torch_accelerator
class StableDiffusionLDM3DPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
inputs = {
"prompt": "a photograph of an astronaut riding a horse",
"latents": latents,
"generator": generator,
"num_inference_steps": 3,
"guidance_scale": 7.5,
"output_type": "np",
}
return inputs
def test_ldm3d_stable_diffusion(self):
ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d")
ldm3d_pipe = ldm3d_pipe.to(torch_device)
ldm3d_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device)
output = ldm3d_pipe(**inputs)
rgb, depth = output.rgb, output.depth
rgb_slice = rgb[0, -3:, -3:, -1].flatten()
depth_slice = rgb[0, -3:, -1].flatten()
assert rgb.shape == (1, 512, 512, 3)
assert depth.shape == (1, 512, 512)
expected_slice_rgb = np.array(
[0.53805465, 0.56707305, 0.5486515, 0.57012236, 0.5814511, 0.56253487, 0.54843014, 0.55092263, 0.6459706]
)
expected_slice_depth = np.array(
[0.9263781, 0.6678672, 0.5486515, 0.92202145, 0.67831135, 0.56253487, 0.9241694, 0.7551478, 0.6459706]
)
assert np.abs(rgb_slice - expected_slice_rgb).max() < 3e-3
assert np.abs(depth_slice - expected_slice_depth).max() < 3e-3
@nightly
@require_torch_accelerator
class StableDiffusionPipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
inputs = {
"prompt": "a photograph of an astronaut riding a horse",
"latents": latents,
"generator": generator,
"num_inference_steps": 50,
"guidance_scale": 7.5,
"output_type": "np",
}
return inputs
def test_ldm3d(self):
ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d").to(torch_device)
ldm3d_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device)
output = ldm3d_pipe(**inputs)
rgb, depth = output.rgb, output.depth
expected_rgb_mean = 0.495586
expected_rgb_std = 0.33795515
expected_depth_mean = 112.48518
expected_depth_std = 98.489746
assert np.abs(expected_rgb_mean - rgb.mean()) < 1e-3
assert np.abs(expected_rgb_std - rgb.std()) < 1e-3
assert np.abs(expected_depth_mean - depth.mean()) < 1e-3
assert np.abs(expected_depth_std - depth.std()) < 1e-3
def test_ldm3d_v2(self):
ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d-4c").to(torch_device)
ldm3d_pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device)
output = ldm3d_pipe(**inputs)
rgb, depth = output.rgb, output.depth
expected_rgb_mean = 0.4194127
expected_rgb_std = 0.35375586
expected_depth_mean = 0.5638502
expected_depth_std = 0.34686103
assert rgb.shape == (1, 512, 512, 3)
assert depth.shape == (1, 512, 512, 1)
assert np.abs(expected_rgb_mean - rgb.mean()) < 1e-3
assert np.abs(expected_rgb_std - rgb.std()) < 1e-3
assert np.abs(expected_depth_mean - depth.mean()) < 1e-3
assert np.abs(expected_depth_std - depth.std()) < 1e-3

View File

@@ -1,444 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
AutoencoderKL,
DDIMScheduler,
EulerAncestralDiscreteScheduler,
LMSDiscreteScheduler,
PNDMScheduler,
StableDiffusionPanoramaPipeline,
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
nightly,
require_torch_accelerator,
skip_mps,
torch_device,
)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import (
IPAdapterTesterMixin,
PipelineFromPipeTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
)
enable_full_determinism()
@skip_mps
class StableDiffusionPanoramaPipelineFastTests(
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
PipelineFromPipeTesterMixin,
unittest.TestCase,
):
pipeline_class = StableDiffusionPanoramaPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(32, 64),
layers_per_block=1,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
)
scheduler = DDIMScheduler()
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[32, 64],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"image_encoder": None,
}
return components
def get_dummy_inputs(self, device, seed=0):
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "a photo of the dolomites",
"generator": generator,
# Setting height and width to None to prevent OOMs on CPU.
"height": None,
"width": None,
"num_inference_steps": 1,
"guidance_scale": 6.0,
"output_type": "np",
}
return inputs
def test_stable_diffusion_panorama_default_case(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = StableDiffusionPanoramaPipeline(**components)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.6186, 0.5374, 0.4915, 0.4135, 0.4114, 0.4563, 0.5128, 0.4977, 0.4757])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_stable_diffusion_panorama_circular_padding_case(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = StableDiffusionPanoramaPipeline(**components)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
image = sd_pipe(**inputs, circular_padding=True).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.6127, 0.6299, 0.4595, 0.4051, 0.4543, 0.3925, 0.5510, 0.5693, 0.5031])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
# override to speed the overall test timing up.
def test_inference_batch_consistent(self):
super().test_inference_batch_consistent(batch_sizes=[1, 2])
# override to speed the overall test timing up.
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=5.0e-3)
def test_float16_inference(self):
super().test_float16_inference(expected_max_diff=1e-1)
def test_stable_diffusion_panorama_negative_prompt(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = StableDiffusionPanoramaPipeline(**components)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
negative_prompt = "french fries"
output = sd_pipe(**inputs, negative_prompt=negative_prompt)
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.6187, 0.5375, 0.4915, 0.4136, 0.4114, 0.4563, 0.5128, 0.4976, 0.4757])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_stable_diffusion_panorama_views_batch(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = StableDiffusionPanoramaPipeline(**components)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
output = sd_pipe(**inputs, view_batch_size=2)
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.6187, 0.5375, 0.4915, 0.4136, 0.4114, 0.4563, 0.5128, 0.4976, 0.4757])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_stable_diffusion_panorama_views_batch_circular_padding(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = StableDiffusionPanoramaPipeline(**components)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
output = sd_pipe(**inputs, circular_padding=True, view_batch_size=2)
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.6127, 0.6299, 0.4595, 0.4051, 0.4543, 0.3925, 0.5510, 0.5693, 0.5031])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_stable_diffusion_panorama_euler(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
components["scheduler"] = EulerAncestralDiscreteScheduler(
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
)
sd_pipe = StableDiffusionPanoramaPipeline(**components)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.4024, 0.6510, 0.4901, 0.5378, 0.5813, 0.5622, 0.4795, 0.4467, 0.4952])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_stable_diffusion_panorama_pndm(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
components["scheduler"] = PNDMScheduler(
beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
)
sd_pipe = StableDiffusionPanoramaPipeline(**components)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
image = sd_pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.6391, 0.6291, 0.4861, 0.5134, 0.5552, 0.4578, 0.5032, 0.5023, 0.4539])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_encode_prompt_works_in_isolation(self):
extra_required_param_value_dict = {
"device": torch.device(torch_device).type,
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
}
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
@nightly
@require_torch_accelerator
class StableDiffusionPanoramaNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def get_inputs(self, seed=0):
generator = torch.manual_seed(seed)
inputs = {
"prompt": "a photo of the dolomites",
"generator": generator,
"num_inference_steps": 3,
"guidance_scale": 7.5,
"output_type": "np",
}
return inputs
def test_stable_diffusion_panorama_default(self):
model_ckpt = "stabilityai/stable-diffusion-2-base"
scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs()
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 2048, 3)
expected_slice = np.array(
[
0.36968392,
0.27025372,
0.32446766,
0.28379387,
0.36363274,
0.30733347,
0.27100027,
0.27054125,
0.25536096,
]
)
assert np.abs(expected_slice - image_slice).max() < 1e-2
def test_stable_diffusion_panorama_k_lms(self):
pipe = StableDiffusionPanoramaPipeline.from_pretrained(
"stabilityai/stable-diffusion-2-base", safety_checker=None
)
pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.unet.set_default_attn_processor()
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs()
image = pipe(**inputs).images
image_slice = image[0, -3:, -3:, -1].flatten()
assert image.shape == (1, 512, 2048, 3)
expected_slice = np.array(
[
[
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
]
]
)
assert np.abs(expected_slice - image_slice).max() < 1e-2
def test_stable_diffusion_panorama_intermediate_state(self):
number_of_steps = 0
def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
callback_fn.has_been_called = True
nonlocal number_of_steps
number_of_steps += 1
if step == 1:
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 256)
latents_slice = latents[0, -3:, -3:, -1]
expected_slice = np.array(
[
0.18681869,
0.33907816,
0.5361276,
0.14432865,
-0.02856611,
-0.73941123,
0.23397987,
0.47322682,
-0.37823164,
]
)
assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
elif step == 2:
latents = latents.detach().cpu().numpy()
assert latents.shape == (1, 4, 64, 256)
latents_slice = latents[0, -3:, -3:, -1]
expected_slice = np.array(
[
0.18539645,
0.33987248,
0.5378559,
0.14437142,
-0.02455261,
-0.7338317,
0.23990755,
0.47356272,
-0.3786505,
]
)
assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
callback_fn.has_been_called = False
model_ckpt = "stabilityai/stable-diffusion-2-base"
scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs()
pipe(**inputs, callback=callback_fn, callback_steps=1)
assert callback_fn.has_been_called
assert number_of_steps == 3
def test_stable_diffusion_panorama_pipeline_with_sequential_cpu_offloading(self):
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
model_ckpt = "stabilityai/stable-diffusion-2-base"
scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing(1)
pipe.enable_sequential_cpu_offload()
inputs = self.get_inputs()
_ = pipe(**inputs)
mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 5.2 GB is allocated
assert mem_bytes < 5.5 * 10**9

View File

@@ -1,497 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import random
import tempfile
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
from diffusers.pipelines.stable_diffusion_safe import StableDiffusionPipelineSafe as StableDiffusionPipeline
from diffusers.utils.testing_utils import (
Expectations,
backend_empty_cache,
floats_tensor,
nightly,
require_accelerator,
require_torch_accelerator,
torch_device,
)
class SafeDiffusionPipelineFastTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
@property
def dummy_image(self):
batch_size = 1
num_channels = 3
sizes = (32, 32)
image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
return image
@property
def dummy_cond_unet(self):
torch.manual_seed(0)
model = UNet2DConditionModel(
block_out_channels=(32, 64),
layers_per_block=2,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
)
return model
@property
def dummy_vae(self):
torch.manual_seed(0)
model = AutoencoderKL(
block_out_channels=[32, 64],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
)
return model
@property
def dummy_text_encoder(self):
torch.manual_seed(0)
config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
return CLIPTextModel(config)
@property
def dummy_extractor(self):
def extract(*args, **kwargs):
class Out:
def __init__(self):
self.pixel_values = torch.ones([0])
def to(self, device):
self.pixel_values.to(device)
return self
return Out()
return extract
def test_safe_diffusion_ddim(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
unet = self.dummy_cond_unet
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
vae = self.dummy_vae
bert = self.dummy_text_encoder
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
# make sure here that pndm scheduler skips prk
sd_pipe = StableDiffusionPipeline(
unet=unet,
scheduler=scheduler,
vae=vae,
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
feature_extractor=self.dummy_extractor,
)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = torch.Generator(device=device).manual_seed(0)
output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
image = output.images
generator = torch.Generator(device=device).manual_seed(0)
image_from_tuple = sd_pipe(
[prompt],
generator=generator,
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
return_dict=False,
)[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.5756, 0.6118, 0.5005, 0.5041, 0.5471, 0.4726, 0.4976, 0.4865, 0.4864])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
def test_stable_diffusion_pndm(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
unet = self.dummy_cond_unet
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
# make sure here that pndm scheduler skips prk
sd_pipe = StableDiffusionPipeline(
unet=unet,
scheduler=scheduler,
vae=vae,
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
feature_extractor=self.dummy_extractor,
)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
generator = torch.Generator(device=device).manual_seed(0)
output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
image = output.images
generator = torch.Generator(device=device).manual_seed(0)
image_from_tuple = sd_pipe(
[prompt],
generator=generator,
guidance_scale=6.0,
num_inference_steps=2,
output_type="np",
return_dict=False,
)[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.5125, 0.5716, 0.4828, 0.5060, 0.5650, 0.4768, 0.5185, 0.4895, 0.4993])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
def test_stable_diffusion_no_safety_checker(self):
pipe = StableDiffusionPipeline.from_pretrained(
"hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
)
assert isinstance(pipe, StableDiffusionPipeline)
assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
assert pipe.safety_checker is None
image = pipe("example prompt", num_inference_steps=2).images[0]
assert image is not None
# check that there's no error when saving a pipeline with one of the models being None
with tempfile.TemporaryDirectory() as tmpdirname:
pipe.save_pretrained(tmpdirname)
pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)
# sanity check that the pipeline still works
assert pipe.safety_checker is None
image = pipe("example prompt", num_inference_steps=2).images[0]
assert image is not None
@require_accelerator
def test_stable_diffusion_fp16(self):
"""Test that stable diffusion works with fp16"""
unet = self.dummy_cond_unet
scheduler = PNDMScheduler(skip_prk_steps=True)
vae = self.dummy_vae
bert = self.dummy_text_encoder
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
# put models in fp16
unet = unet.half()
vae = vae.half()
bert = bert.half()
# make sure here that pndm scheduler skips prk
sd_pipe = StableDiffusionPipeline(
unet=unet,
scheduler=scheduler,
vae=vae,
text_encoder=bert,
tokenizer=tokenizer,
safety_checker=None,
feature_extractor=self.dummy_extractor,
)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
prompt = "A painting of a squirrel eating a burger"
image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images
assert image.shape == (1, 64, 64, 3)
@nightly
@require_torch_accelerator
class SafeDiffusionPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_harm_safe_stable_diffusion(self):
sd_pipe = StableDiffusionPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
)
sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
prompt = (
"portrait of girl with smokey eyes makeup in abandoned hotel, grange clothes, redshift, wide high angle"
" coloured polaroid photograph with flash, kodak film, hyper real, stunning moody cinematography, with"
" anamorphic lenses, by maripol, fallen angels by wong kar - wai, style of suspiria and neon demon and"
" children from bahnhof zoo, detailed "
)
seed = 4003660346
guidance_scale = 7
# without safety guidance (sld_guidance_scale = 0)
generator = torch.manual_seed(seed)
output = sd_pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
sld_guidance_scale=0,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slices = Expectations(
{
("xpu", 3): [0.0076, 0.0058, 0.0012, 0, 0.0047, 0.0046, 0, 0, 0],
("cuda", 7): [0.2278, 0.2231, 0.2249, 0.2333, 0.2303, 0.1885, 0.2273, 0.2144, 0.2176],
("cuda", 8): [0.0076, 0.0058, 0.0012, 0, 0.0047, 0.0046, 0, 0, 0],
}
)
expected_slice = expected_slices.get_expectation()
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
# without safety guidance (strong configuration)
generator = torch.manual_seed(seed)
output = sd_pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
sld_guidance_scale=2000,
sld_warmup_steps=7,
sld_threshold=0.025,
sld_momentum_scale=0.5,
sld_mom_beta=0.7,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = [0.2383, 0.2276, 0.236, 0.2192, 0.2186, 0.2053, 0.1971, 0.1901, 0.1719]
expected_slices = Expectations(
{
("xpu", 3): [0.0443, 0.0439, 0.0381, 0.0336, 0.0408, 0.0345, 0.0405, 0.0338, 0.0293],
("cuda", 7): [0.2383, 0.2276, 0.236, 0.2192, 0.2186, 0.2053, 0.1971, 0.1901, 0.1719],
("cuda", 8): [0.0443, 0.0439, 0.0381, 0.0336, 0.0408, 0.0345, 0.0405, 0.0338, 0.0293],
}
)
expected_slice = expected_slices.get_expectation()
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_nudity_safe_stable_diffusion(self):
sd_pipe = StableDiffusionPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
)
sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
prompt = "padme amidala taking a bath artwork, safe for work, no nudity"
seed = 2734971755
guidance_scale = 7
generator = torch.manual_seed(seed)
output = sd_pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
sld_guidance_scale=0,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slices = Expectations(
{
("xpu", 3): [0.3244, 0.3355, 0.3260, 0.3123, 0.3246, 0.3426, 0.3109, 0.3471, 0.4001],
("cuda", 7): [0.3502, 0.3622, 0.3396, 0.3642, 0.3478, 0.3318, 0.35, 0.3348, 0.3297],
("cuda", 8): [0.3605, 0.3684, 0.3712, 0.3624, 0.3675, 0.3726, 0.3494, 0.3748, 0.4044],
}
)
expected_slice = expected_slices.get_expectation()
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
generator = torch.manual_seed(seed)
output = sd_pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
sld_guidance_scale=2000,
sld_warmup_steps=7,
sld_threshold=0.025,
sld_momentum_scale=0.5,
sld_mom_beta=0.7,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slices = Expectations(
{
("xpu", 3): [0.6178, 0.6260, 0.6194, 0.6435, 0.6265, 0.6461, 0.6567, 0.6576, 0.6444],
("cuda", 7): [0.5531, 0.5206, 0.4895, 0.5156, 0.5182, 0.4751, 0.4802, 0.4803, 0.4443],
("cuda", 8): [0.5892, 0.5959, 0.5914, 0.6123, 0.5982, 0.6141, 0.6180, 0.6262, 0.6171],
}
)
print(f"image_slice: {image_slice.flatten()}")
expected_slice = expected_slices.get_expectation()
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
def test_nudity_safetychecker_safe_stable_diffusion(self):
sd_pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
sd_pipe = sd_pipe.to(torch_device)
sd_pipe.set_progress_bar_config(disable=None)
prompt = (
"the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c."
" leyendecker"
)
seed = 1044355234
guidance_scale = 12
generator = torch.manual_seed(seed)
output = sd_pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
sld_guidance_scale=0,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-7
generator = torch.manual_seed(seed)
output = sd_pipe(
[prompt],
generator=generator,
guidance_scale=guidance_scale,
num_inference_steps=50,
output_type="np",
width=512,
height=512,
sld_guidance_scale=2000,
sld_warmup_steps=7,
sld_threshold=0.025,
sld_momentum_scale=0.5,
sld_mom_beta=0.7,
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
expected_slices = Expectations(
{
("xpu", 3): np.array([0.0695, 0.1244, 0.1831, 0.0527, 0.0444, 0.1660, 0.0572, 0.0677, 0.1551]),
("cuda", 7): np.array([0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334, 0.6561]),
("cuda", 8): np.array([0.0695, 0.1244, 0.1831, 0.0527, 0.0444, 0.1660, 0.0572, 0.0677, 0.1551]),
}
)
expected_slice = expected_slices.get_expectation()
assert image.shape == (1, 512, 512, 3)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

View File

@@ -1,245 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
AutoencoderKL,
DDIMScheduler,
DEISMultistepScheduler,
DPMSolverMultistepScheduler,
EulerDiscreteScheduler,
StableDiffusionSAGPipeline,
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
nightly,
require_torch_accelerator,
torch_device,
)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import (
IPAdapterTesterMixin,
PipelineFromPipeTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
)
enable_full_determinism()
class StableDiffusionSAGPipelineFastTests(
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
PipelineFromPipeTesterMixin,
unittest.TestCase,
):
pipeline_class = StableDiffusionSAGPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(4, 8),
layers_per_block=2,
sample_size=8,
norm_num_groups=1,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=8,
)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[4, 8],
norm_num_groups=1,
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=8,
num_hidden_layers=2,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
pad_token_id=1,
vocab_size=1000,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"image_encoder": None,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": ".",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 1.0,
"sag_scale": 1.0,
"output_type": "np",
}
return inputs
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=3e-3)
@unittest.skip("Not necessary to test here.")
def test_xformers_attention_forwardGenerator_pass(self):
pass
def test_pipeline_different_schedulers(self):
pipeline = self.pipeline_class(**self.get_dummy_components())
inputs = self.get_dummy_inputs("cpu")
expected_image_size = (16, 16, 3)
for scheduler_cls in [DDIMScheduler, DEISMultistepScheduler, DPMSolverMultistepScheduler]:
pipeline.scheduler = scheduler_cls.from_config(pipeline.scheduler.config)
image = pipeline(**inputs).images[0]
shape = image.shape
assert shape == expected_image_size
pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
with self.assertRaises(ValueError):
# Karras schedulers are not supported
image = pipeline(**inputs).images[0]
def test_encode_prompt_works_in_isolation(self):
extra_required_param_value_dict = {
"device": torch.device(torch_device).type,
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
}
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
@nightly
@require_torch_accelerator
class StableDiffusionPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_stable_diffusion_1(self):
sag_pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
sag_pipe = sag_pipe.to(torch_device)
sag_pipe.set_progress_bar_config(disable=None)
prompt = "."
generator = torch.manual_seed(0)
output = sag_pipe(
[prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np"
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.1568, 0.1738, 0.1695, 0.1693, 0.1507, 0.1705, 0.1547, 0.1751, 0.1949])
assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2
def test_stable_diffusion_2(self):
sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
sag_pipe = sag_pipe.to(torch_device)
sag_pipe.set_progress_bar_config(disable=None)
prompt = "."
generator = torch.manual_seed(0)
output = sag_pipe(
[prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np"
)
image = output.images
image_slice = image[0, -3:, -3:, -1]
assert image.shape == (1, 512, 512, 3)
expected_slice = np.array([0.3459, 0.2876, 0.2537, 0.3002, 0.2671, 0.2160, 0.3026, 0.2262, 0.2371])
assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2
def test_stable_diffusion_2_non_square(self):
sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
sag_pipe = sag_pipe.to(torch_device)
sag_pipe.set_progress_bar_config(disable=None)
prompt = "."
generator = torch.manual_seed(0)
output = sag_pipe(
[prompt],
width=768,
height=512,
generator=generator,
guidance_scale=7.5,
sag_scale=1.0,
num_inference_steps=20,
output_type="np",
)
image = output.images
assert image.shape == (1, 512, 768, 3)

View File

@@ -1,231 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoSDPipeline, UNet3DConditionModel
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
load_numpy,
numpy_cosine_similarity_distance,
require_torch_accelerator,
skip_mps,
slow,
torch_device,
)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, SDFunctionTesterMixin
enable_full_determinism()
@skip_mps
class TextToVideoSDPipelineFastTests(PipelineTesterMixin, SDFunctionTesterMixin, unittest.TestCase):
pipeline_class = TextToVideoSDPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
# No `output_type`.
required_optional_params = frozenset(
[
"num_inference_steps",
"generator",
"latents",
"return_dict",
"callback",
"callback_steps",
]
)
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet3DConditionModel(
block_out_channels=(8, 8),
layers_per_block=1,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
cross_attention_dim=4,
attention_head_dim=4,
norm_num_groups=2,
)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=(8,),
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D"],
latent_channels=4,
sample_size=32,
norm_num_groups=2,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=4,
intermediate_size=16,
layer_norm_eps=1e-05,
num_attention_heads=2,
num_hidden_layers=2,
pad_token_id=1,
vocab_size=1000,
hidden_act="gelu",
projection_dim=32,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "A painting of a squirrel eating a burger",
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
"output_type": "pt",
}
return inputs
def test_dict_tuple_outputs_equivalent(self):
return super().test_dict_tuple_outputs_equivalent()
def test_text_to_video_default_case(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = TextToVideoSDPipeline(**components)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
inputs["output_type"] = "np"
frames = sd_pipe(**inputs).frames
image_slice = frames[0][0][-3:, -3:, -1]
assert frames[0][0].shape == (32, 32, 3)
expected_slice = np.array([0.8093, 0.2751, 0.6976, 0.5927, 0.4616, 0.4336, 0.5094, 0.5683, 0.4796])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@unittest.skipIf(torch_device != "cuda", reason="Feature isn't heavily used. Test in CUDA environment only.")
def test_attention_slicing_forward_pass(self):
self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False, expected_max_diff=3e-3)
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
)
def test_xformers_attention_forwardGenerator_pass(self):
self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=1e-2)
# (todo): sayakpaul
@unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
def test_inference_batch_consistent(self):
pass
# (todo): sayakpaul
@unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
def test_inference_batch_single_identical(self):
pass
@unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
def test_num_images_per_prompt(self):
pass
def test_encode_prompt_works_in_isolation(self):
extra_required_param_value_dict = {
"device": torch.device(torch_device).type,
"num_images_per_prompt": 1,
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
}
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
@slow
@skip_mps
@require_torch_accelerator
class TextToVideoSDPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_two_step_model(self):
expected_video = load_numpy(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/video_2step.npy"
)
pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
pipe = pipe.to(torch_device)
prompt = "Spiderman is surfing"
generator = torch.Generator(device="cpu").manual_seed(0)
video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").frames
assert numpy_cosine_similarity_distance(expected_video.flatten(), video_frames.flatten()) < 1e-4
def test_two_step_model_with_freeu(self):
expected_video = []
pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
pipe = pipe.to(torch_device)
prompt = "Spiderman is surfing"
generator = torch.Generator(device="cpu").manual_seed(0)
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").frames
video = video_frames[0, 0, -3:, -3:, -1].flatten()
expected_video = [0.3643, 0.3455, 0.3831, 0.3923, 0.2978, 0.3247, 0.3278, 0.3201, 0.3475]
assert np.abs(expected_video - video).mean() < 5e-2

View File

@@ -1,62 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
import torch
from diffusers import DDIMScheduler, TextToVideoZeroPipeline
from diffusers.utils.testing_utils import (
backend_empty_cache,
load_pt,
nightly,
require_torch_accelerator,
torch_device,
)
from ..test_pipelines_common import assert_mean_pixel_difference
@nightly
@require_torch_accelerator
class TextToVideoZeroPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_full_model(self):
model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
generator = torch.Generator(device="cpu").manual_seed(0)
prompt = "A bear is playing a guitar on Times Square"
result = pipe(prompt=prompt, generator=generator).images
expected_result = load_pt(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/A bear is playing a guitar on Times Square.pt",
weights_only=False,
)
assert_mean_pixel_difference(result, expected_result)

View File

@@ -1,403 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import inspect
import tempfile
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoZeroSDXLPipeline, UNet2DConditionModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
nightly,
require_accelerate_version_greater,
require_torch_accelerator,
torch_device,
)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineFromPipeTesterMixin, PipelineTesterMixin
enable_full_determinism()
def to_np(tensor):
if isinstance(tensor, torch.Tensor):
tensor = tensor.detach().cpu().numpy()
return tensor
class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase):
pipeline_class = TextToVideoZeroSDXLPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
generator_device = "cpu"
def get_dummy_components(self, seed=0):
torch.manual_seed(seed)
unet = UNet2DConditionModel(
block_out_channels=(2, 4),
layers_per_block=2,
sample_size=2,
norm_num_groups=2,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
# SD2-specific config below
attention_head_dim=(2, 4),
use_linear_projection=True,
addition_embed_type="text_time",
addition_time_embed_dim=8,
transformer_layers_per_block=(1, 2),
projection_class_embeddings_input_dim=80, # 6 * 8 + 32
cross_attention_dim=64,
)
scheduler = DDIMScheduler(
num_train_timesteps=1000,
beta_start=0.0001,
beta_end=0.02,
beta_schedule="linear",
trained_betas=None,
clip_sample=True,
set_alpha_to_one=True,
steps_offset=0,
prediction_type="epsilon",
thresholding=False,
dynamic_thresholding_ratio=0.995,
clip_sample_range=1.0,
sample_max_value=1.0,
timestep_spacing="leading",
rescale_betas_zero_snr=False,
)
torch.manual_seed(seed)
vae = AutoencoderKL(
block_out_channels=[32, 64],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
sample_size=128,
)
torch.manual_seed(seed)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
# SD2-specific config below
hidden_act="gelu",
projection_dim=32,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"text_encoder_2": text_encoder_2,
"tokenizer_2": tokenizer_2,
"image_encoder": None,
"feature_extractor": None,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "A panda dancing in Antarctica",
"generator": generator,
"num_inference_steps": 5,
"t0": 1,
"t1": 3,
"height": 64,
"width": 64,
"video_length": 3,
"output_type": "np",
}
return inputs
def get_generator(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
return generator
def test_text_to_video_zero_sdxl(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
inputs = self.get_dummy_inputs(self.generator_device)
result = pipe(**inputs).images
first_frame_slice = result[0, -3:, -3:, -1]
last_frame_slice = result[-1, -3:, -3:, 0]
expected_slice1 = np.array(
[0.6008109, 0.73051643, 0.51778656, 0.55817354, 0.45222935, 0.45998418, 0.57017255, 0.54874814, 0.47078788]
)
expected_slice2 = np.array(
[0.6011751, 0.47420046, 0.41660714, 0.6472957, 0.41261768, 0.5438129, 0.7401535, 0.6756011, 0.53652245]
)
assert np.abs(first_frame_slice.flatten() - expected_slice1).max() < 1e-2
assert np.abs(last_frame_slice.flatten() - expected_slice2).max() < 1e-2
@unittest.skip(
reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
)
def test_attention_slicing_forward_pass(self):
pass
def test_cfg(self):
sig = inspect.signature(self.pipeline_class.__call__)
if "guidance_scale" not in sig.parameters:
return
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(self.generator_device)
inputs["guidance_scale"] = 1.0
out_no_cfg = pipe(**inputs)[0]
inputs["guidance_scale"] = 7.5
out_cfg = pipe(**inputs)[0]
assert out_cfg.shape == out_no_cfg.shape
def test_dict_tuple_outputs_equivalent(self, expected_max_difference=1e-4):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
output = pipe(**self.get_dummy_inputs(self.generator_device))[0]
output_tuple = pipe(**self.get_dummy_inputs(self.generator_device), return_dict=False)[0]
max_diff = np.abs(to_np(output) - to_np(output_tuple)).max()
self.assertLess(max_diff, expected_max_difference)
@unittest.skipIf(torch_device not in ["cuda", "xpu"], reason="float16 requires CUDA or XPU")
@require_torch_accelerator
def test_float16_inference(self, expected_max_diff=5e-2):
components = self.get_dummy_components()
for name, module in components.items():
if hasattr(module, "half"):
components[name] = module.to(torch_device).half()
pipe = self.pipeline_class(**components)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
components = self.get_dummy_components()
pipe_fp16 = self.pipeline_class(**components)
pipe_fp16.to(torch_device, torch.float16)
pipe_fp16.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(self.generator_device)
# # Reset generator in case it is used inside dummy inputs
if "generator" in inputs:
inputs["generator"] = self.get_generator(self.generator_device)
output = pipe(**inputs)[0]
fp16_inputs = self.get_dummy_inputs(self.generator_device)
# Reset generator in case it is used inside dummy inputs
if "generator" in fp16_inputs:
fp16_inputs["generator"] = self.get_generator(self.generator_device)
output_fp16 = pipe_fp16(**fp16_inputs)[0]
max_diff = np.abs(to_np(output) - to_np(output_fp16)).max()
self.assertLess(max_diff, expected_max_diff, "The outputs of the fp16 and fp32 pipelines are too different.")
@unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
def test_inference_batch_consistent(self):
pass
@unittest.skip(
reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
)
def test_inference_batch_single_identical(self):
pass
@require_torch_accelerator
@require_accelerate_version_greater("0.17.0")
def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(self.generator_device)
output_without_offload = pipe(**inputs)[0]
pipe.enable_model_cpu_offload(device=torch_device)
inputs = self.get_dummy_inputs(self.generator_device)
output_with_offload = pipe(**inputs)[0]
max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")
@unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
def test_pipeline_call_signature(self):
pass
@unittest.skipIf(torch_device not in ["cuda", "xpu"], reason="float16 requires CUDA or XPU")
@require_torch_accelerator
def test_save_load_float16(self, expected_max_diff=1e-2):
components = self.get_dummy_components()
for name, module in components.items():
if hasattr(module, "half"):
components[name] = module.to(torch_device).half()
pipe = self.pipeline_class(**components)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(self.generator_device)
output = pipe(**inputs)[0]
with tempfile.TemporaryDirectory() as tmpdir:
pipe.save_pretrained(tmpdir)
pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, torch_dtype=torch.float16)
pipe_loaded.to(torch_device)
pipe_loaded.set_progress_bar_config(disable=None)
for name, component in pipe_loaded.components.items():
if hasattr(component, "dtype"):
self.assertTrue(
component.dtype == torch.float16,
f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.",
)
inputs = self.get_dummy_inputs(self.generator_device)
output_loaded = pipe_loaded(**inputs)[0]
max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
self.assertLess(
max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading."
)
@unittest.skip(
reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
)
def test_save_load_local(self):
pass
@unittest.skip(
reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
)
def test_save_load_optional_components(self):
pass
@unittest.skip(
reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
)
def test_sequential_cpu_offload_forward_pass(self):
pass
@require_torch_accelerator
def test_to_device(self):
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe.set_progress_bar_config(disable=None)
pipe.to("cpu")
model_devices = [component.device.type for component in components.values() if hasattr(component, "device")]
self.assertTrue(all(device == "cpu" for device in model_devices))
output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0] # generator set to cpu
self.assertTrue(np.isnan(output_cpu).sum() == 0)
pipe.to(torch_device)
model_devices = [component.device.type for component in components.values() if hasattr(component, "device")]
self.assertTrue(all(device == torch_device for device in model_devices))
output_device = pipe(**self.get_dummy_inputs("cpu"))[0] # generator set to cpu
self.assertTrue(np.isnan(to_np(output_device)).sum() == 0)
@unittest.skip(
reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
)
def test_xformers_attention_forwardGenerator_pass(self):
pass
@nightly
@require_torch_accelerator
class TextToVideoZeroSDXLPipelineSlowTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_full_model(self):
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
pipe = TextToVideoZeroSDXLPipeline.from_pretrained(
model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True
)
pipe.enable_model_cpu_offload()
pipe.enable_vae_slicing()
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
generator = torch.Generator(device="cpu").manual_seed(0)
prompt = "A panda dancing in Antarctica"
result = pipe(prompt=prompt, generator=generator).images
first_frame_slice = result[0, -3:, -3:, -1]
last_frame_slice = result[-1, -3:, -3:, 0]
expected_slice1 = np.array([0.57, 0.57, 0.57, 0.57, 0.57, 0.56, 0.55, 0.56, 0.56])
expected_slice2 = np.array([0.54, 0.53, 0.53, 0.53, 0.53, 0.52, 0.53, 0.53, 0.53])
assert np.abs(first_frame_slice.flatten() - expected_slice1).max() < 1e-2
assert np.abs(last_frame_slice.flatten() - expected_slice2).max() < 1e-2

View File

@@ -1,229 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
AutoencoderKL,
DDIMScheduler,
UNet3DConditionModel,
VideoToVideoSDPipeline,
)
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import (
enable_full_determinism,
floats_tensor,
is_flaky,
nightly,
numpy_cosine_similarity_distance,
skip_mps,
torch_device,
)
from ..pipeline_params import (
TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
)
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()
@skip_mps
class VideoToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = VideoToVideoSDPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS.union({"video"}) - {"image", "width", "height"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"video"}) - {"image"}
required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
test_attention_slicing = False
# No `output_type`.
required_optional_params = frozenset(
[
"num_inference_steps",
"generator",
"latents",
"return_dict",
"callback",
"callback_steps",
]
)
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet3DConditionModel(
block_out_channels=(4, 8),
layers_per_block=1,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
cross_attention_dim=32,
attention_head_dim=4,
norm_num_groups=2,
)
scheduler = DDIMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=True,
set_alpha_to_one=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[
8,
],
in_channels=3,
out_channels=3,
down_block_types=[
"DownEncoderBlock2D",
],
up_block_types=["UpDecoderBlock2D"],
latent_channels=4,
sample_size=32,
norm_num_groups=2,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
hidden_act="gelu",
projection_dim=512,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
}
return components
def get_dummy_inputs(self, device, seed=0):
# 3 frames
video = floats_tensor((1, 3, 3, 32, 32), rng=random.Random(seed)).to(device)
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "A painting of a squirrel eating a burger",
"video": video,
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
"output_type": "pt",
}
return inputs
def test_text_to_video_default_case(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
sd_pipe = VideoToVideoSDPipeline(**components)
sd_pipe = sd_pipe.to(device)
sd_pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
inputs["output_type"] = "np"
frames = sd_pipe(**inputs).frames
image_slice = frames[0][0][-3:, -3:, -1]
assert frames[0][0].shape == (32, 32, 3)
expected_slice = np.array([0.6391, 0.5350, 0.5202, 0.5521, 0.5453, 0.5393, 0.6652, 0.5270, 0.5185])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@is_flaky()
def test_save_load_optional_components(self):
super().test_save_load_optional_components(expected_max_difference=0.001)
@is_flaky()
def test_dict_tuple_outputs_equivalent(self):
super().test_dict_tuple_outputs_equivalent()
@is_flaky()
def test_save_load_local(self):
super().test_save_load_local()
@unittest.skipIf(
torch_device != "cuda" or not is_xformers_available(),
reason="XFormers attention is only available with CUDA and `xformers` installed",
)
def test_xformers_attention_forwardGenerator_pass(self):
self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=5e-3)
# (todo): sayakpaul
@unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
def test_inference_batch_consistent(self):
pass
# (todo): sayakpaul
@unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
def test_inference_batch_single_identical(self):
pass
@unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
def test_num_images_per_prompt(self):
pass
def test_encode_prompt_works_in_isolation(self):
extra_required_param_value_dict = {
"device": torch.device(torch_device).type,
"num_images_per_prompt": 1,
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
}
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
@nightly
@skip_mps
class VideoToVideoSDPipelineSlowTests(unittest.TestCase):
def test_two_step_model(self):
pipe = VideoToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()
# 10 frames
generator = torch.Generator(device="cpu").manual_seed(0)
video = torch.randn((1, 10, 3, 320, 576), generator=generator)
prompt = "Spiderman is surfing"
generator = torch.Generator(device="cpu").manual_seed(0)
video_frames = pipe(prompt, video=video, generator=generator, num_inference_steps=3, output_type="np").frames
expected_array = np.array(
[0.17114258, 0.13720703, 0.08886719, 0.14819336, 0.1730957, 0.24584961, 0.22021484, 0.35180664, 0.2607422]
)
output_array = video_frames[0, 0, :3, :3, 0].flatten()
assert numpy_cosine_similarity_distance(expected_array, output_array) < 1e-3

View File

@@ -1,523 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel
from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
backend_max_memory_allocated,
backend_reset_max_memory_allocated,
backend_reset_peak_memory_stats,
enable_full_determinism,
load_numpy,
nightly,
require_torch_accelerator,
skip_mps,
torch_device,
)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
enable_full_determinism()
class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = UnCLIPPipeline
params = TEXT_TO_IMAGE_PARAMS - {
"negative_prompt",
"height",
"width",
"negative_prompt_embeds",
"guidance_scale",
"prompt_embeds",
"cross_attention_kwargs",
}
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
required_optional_params = [
"generator",
"return_dict",
"prior_num_inference_steps",
"decoder_num_inference_steps",
"super_res_num_inference_steps",
]
test_xformers_attention = False
@property
def text_embedder_hidden_size(self):
return 32
@property
def time_input_dim(self):
return 32
@property
def block_out_channels_0(self):
return self.time_input_dim
@property
def time_embed_dim(self):
return self.time_input_dim * 4
@property
def cross_attention_dim(self):
return 100
@property
def dummy_tokenizer(self):
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
return tokenizer
@property
def dummy_text_encoder(self):
torch.manual_seed(0)
config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=self.text_embedder_hidden_size,
projection_dim=self.text_embedder_hidden_size,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
return CLIPTextModelWithProjection(config)
@property
def dummy_prior(self):
torch.manual_seed(0)
model_kwargs = {
"num_attention_heads": 2,
"attention_head_dim": 12,
"embedding_dim": self.text_embedder_hidden_size,
"num_layers": 1,
}
model = PriorTransformer(**model_kwargs)
return model
@property
def dummy_text_proj(self):
torch.manual_seed(0)
model_kwargs = {
"clip_embeddings_dim": self.text_embedder_hidden_size,
"time_embed_dim": self.time_embed_dim,
"cross_attention_dim": self.cross_attention_dim,
}
model = UnCLIPTextProjModel(**model_kwargs)
return model
@property
def dummy_decoder(self):
torch.manual_seed(0)
model_kwargs = {
"sample_size": 32,
# RGB in channels
"in_channels": 3,
# Out channels is double in channels because predicts mean and variance
"out_channels": 6,
"down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
"up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
"mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
"block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
"layers_per_block": 1,
"cross_attention_dim": self.cross_attention_dim,
"attention_head_dim": 4,
"resnet_time_scale_shift": "scale_shift",
"class_embed_type": "identity",
}
model = UNet2DConditionModel(**model_kwargs)
return model
@property
def dummy_super_res_kwargs(self):
return {
"sample_size": 64,
"layers_per_block": 1,
"down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"),
"up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"),
"block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
"in_channels": 6,
"out_channels": 3,
}
@property
def dummy_super_res_first(self):
torch.manual_seed(0)
model = UNet2DModel(**self.dummy_super_res_kwargs)
return model
@property
def dummy_super_res_last(self):
# seeded differently to get different unet than `self.dummy_super_res_first`
torch.manual_seed(1)
model = UNet2DModel(**self.dummy_super_res_kwargs)
return model
def get_dummy_components(self):
prior = self.dummy_prior
decoder = self.dummy_decoder
text_proj = self.dummy_text_proj
text_encoder = self.dummy_text_encoder
tokenizer = self.dummy_tokenizer
super_res_first = self.dummy_super_res_first
super_res_last = self.dummy_super_res_last
prior_scheduler = UnCLIPScheduler(
variance_type="fixed_small_log",
prediction_type="sample",
num_train_timesteps=1000,
clip_sample_range=5.0,
)
decoder_scheduler = UnCLIPScheduler(
variance_type="learned_range",
prediction_type="epsilon",
num_train_timesteps=1000,
)
super_res_scheduler = UnCLIPScheduler(
variance_type="fixed_small_log",
prediction_type="epsilon",
num_train_timesteps=1000,
)
components = {
"prior": prior,
"decoder": decoder,
"text_proj": text_proj,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"super_res_first": super_res_first,
"super_res_last": super_res_last,
"prior_scheduler": prior_scheduler,
"decoder_scheduler": decoder_scheduler,
"super_res_scheduler": super_res_scheduler,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "horse",
"generator": generator,
"prior_num_inference_steps": 2,
"decoder_num_inference_steps": 2,
"super_res_num_inference_steps": 2,
"output_type": "np",
}
return inputs
def test_unclip(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
output = pipe(**self.get_dummy_inputs(device))
image = output.images
image_from_tuple = pipe(
**self.get_dummy_inputs(device),
return_dict=False,
)[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array(
[
0.9997,
0.9988,
0.0028,
0.9997,
0.9984,
0.9965,
0.0029,
0.9986,
0.0025,
]
)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
def test_unclip_passed_text_embed(self):
device = torch.device("cpu")
class DummyScheduler:
init_noise_sigma = 1
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
prior = components["prior"]
decoder = components["decoder"]
super_res_first = components["super_res_first"]
tokenizer = components["tokenizer"]
text_encoder = components["text_encoder"]
generator = torch.Generator(device=device).manual_seed(0)
dtype = prior.dtype
batch_size = 1
shape = (batch_size, prior.config.embedding_dim)
prior_latents = pipe.prepare_latents(
shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
)
shape = (batch_size, decoder.config.in_channels, decoder.config.sample_size, decoder.config.sample_size)
generator = torch.Generator(device=device).manual_seed(0)
decoder_latents = pipe.prepare_latents(
shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
)
shape = (
batch_size,
super_res_first.config.in_channels // 2,
super_res_first.config.sample_size,
super_res_first.config.sample_size,
)
super_res_latents = pipe.prepare_latents(
shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
)
pipe.set_progress_bar_config(disable=None)
prompt = "this is a prompt example"
generator = torch.Generator(device=device).manual_seed(0)
output = pipe(
[prompt],
generator=generator,
prior_num_inference_steps=2,
decoder_num_inference_steps=2,
super_res_num_inference_steps=2,
prior_latents=prior_latents,
decoder_latents=decoder_latents,
super_res_latents=super_res_latents,
output_type="np",
)
image = output.images
text_inputs = tokenizer(
prompt,
padding="max_length",
max_length=tokenizer.model_max_length,
return_tensors="pt",
)
text_model_output = text_encoder(text_inputs.input_ids)
text_attention_mask = text_inputs.attention_mask
generator = torch.Generator(device=device).manual_seed(0)
image_from_text = pipe(
generator=generator,
prior_num_inference_steps=2,
decoder_num_inference_steps=2,
super_res_num_inference_steps=2,
prior_latents=prior_latents,
decoder_latents=decoder_latents,
super_res_latents=super_res_latents,
text_model_output=text_model_output,
text_attention_mask=text_attention_mask,
output_type="np",
)[0]
# make sure passing text embeddings manually is identical
assert np.abs(image - image_from_text).max() < 1e-4
# Overriding PipelineTesterMixin::test_attention_slicing_forward_pass
# because UnCLIP GPU undeterminism requires a looser check.
@skip_mps
def test_attention_slicing_forward_pass(self):
test_max_difference = torch_device == "cpu"
self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference, expected_max_diff=0.01)
# Overriding PipelineTesterMixin::test_inference_batch_single_identical
# because UnCLIP undeterminism requires a looser check.
@skip_mps
def test_inference_batch_single_identical(self):
additional_params_copy_to_batched_inputs = [
"prior_num_inference_steps",
"decoder_num_inference_steps",
"super_res_num_inference_steps",
]
self._test_inference_batch_single_identical(
additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, expected_max_diff=9.8e-3
)
def test_inference_batch_consistent(self):
additional_params_copy_to_batched_inputs = [
"prior_num_inference_steps",
"decoder_num_inference_steps",
"super_res_num_inference_steps",
]
if torch_device == "mps":
# TODO: MPS errors with larger batch sizes
batch_sizes = [2, 3]
self._test_inference_batch_consistent(
batch_sizes=batch_sizes,
additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs,
)
else:
self._test_inference_batch_consistent(
additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs
)
@skip_mps
def test_dict_tuple_outputs_equivalent(self):
return super().test_dict_tuple_outputs_equivalent()
@skip_mps
def test_save_load_local(self):
return super().test_save_load_local(expected_max_difference=5e-3)
@skip_mps
def test_save_load_optional_components(self):
return super().test_save_load_optional_components()
@unittest.skip("UnCLIP produces very large differences in fp16 vs fp32. Test is not useful.")
def test_float16_inference(self):
super().test_float16_inference(expected_max_diff=1.0)
@nightly
class UnCLIPPipelineCPUIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_unclip_karlo_cpu_fp32(self):
expected_image = load_numpy(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
"/unclip/karlo_v1_alpha_horse_cpu.npy"
)
pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha")
pipeline.set_progress_bar_config(disable=None)
generator = torch.manual_seed(0)
output = pipeline(
"horse",
num_images_per_prompt=1,
generator=generator,
output_type="np",
)
image = output.images[0]
assert image.shape == (256, 256, 3)
assert np.abs(expected_image - image).max() < 1e-1
@nightly
@require_torch_accelerator
class UnCLIPPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_unclip_karlo(self):
expected_image = load_numpy(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
"/unclip/karlo_v1_alpha_horse_fp16.npy"
)
pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16)
pipeline = pipeline.to(torch_device)
pipeline.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
output = pipeline(
"horse",
generator=generator,
output_type="np",
)
image = output.images[0]
assert image.shape == (256, 256, 3)
assert_mean_pixel_difference(image, expected_image)
def test_unclip_pipeline_with_sequential_cpu_offloading(self):
backend_empty_cache(torch_device)
backend_reset_max_memory_allocated(torch_device)
backend_reset_peak_memory_stats(torch_device)
pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
pipe.enable_sequential_cpu_offload()
_ = pipe(
"horse",
num_images_per_prompt=1,
prior_num_inference_steps=2,
decoder_num_inference_steps=2,
super_res_num_inference_steps=2,
output_type="np",
)
mem_bytes = backend_max_memory_allocated(torch_device)
# make sure that less than 7 GB is allocated
assert mem_bytes < 7 * 10**9

View File

@@ -1,540 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import random
import unittest
import numpy as np
import torch
from transformers import (
CLIPImageProcessor,
CLIPTextConfig,
CLIPTextModelWithProjection,
CLIPTokenizer,
CLIPVisionConfig,
CLIPVisionModelWithProjection,
)
from diffusers import (
DiffusionPipeline,
UnCLIPImageVariationPipeline,
UnCLIPScheduler,
UNet2DConditionModel,
UNet2DModel,
)
from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
load_numpy,
nightly,
require_torch_accelerator,
skip_mps,
torch_device,
)
from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
enable_full_determinism()
class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = UnCLIPImageVariationPipeline
params = IMAGE_VARIATION_PARAMS - {"height", "width", "guidance_scale"}
batch_params = IMAGE_VARIATION_BATCH_PARAMS
required_optional_params = [
"generator",
"return_dict",
"decoder_num_inference_steps",
"super_res_num_inference_steps",
]
test_xformers_attention = False
supports_dduf = False
@property
def text_embedder_hidden_size(self):
return 32
@property
def time_input_dim(self):
return 32
@property
def block_out_channels_0(self):
return self.time_input_dim
@property
def time_embed_dim(self):
return self.time_input_dim * 4
@property
def cross_attention_dim(self):
return 100
@property
def dummy_tokenizer(self):
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
return tokenizer
@property
def dummy_text_encoder(self):
torch.manual_seed(0)
config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=self.text_embedder_hidden_size,
projection_dim=self.text_embedder_hidden_size,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
return CLIPTextModelWithProjection(config)
@property
def dummy_image_encoder(self):
torch.manual_seed(0)
config = CLIPVisionConfig(
hidden_size=self.text_embedder_hidden_size,
projection_dim=self.text_embedder_hidden_size,
num_hidden_layers=5,
num_attention_heads=4,
image_size=32,
intermediate_size=37,
patch_size=1,
)
return CLIPVisionModelWithProjection(config)
@property
def dummy_text_proj(self):
torch.manual_seed(0)
model_kwargs = {
"clip_embeddings_dim": self.text_embedder_hidden_size,
"time_embed_dim": self.time_embed_dim,
"cross_attention_dim": self.cross_attention_dim,
}
model = UnCLIPTextProjModel(**model_kwargs)
return model
@property
def dummy_decoder(self):
torch.manual_seed(0)
model_kwargs = {
"sample_size": 32,
# RGB in channels
"in_channels": 3,
# Out channels is double in channels because predicts mean and variance
"out_channels": 6,
"down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
"up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
"mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
"block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
"layers_per_block": 1,
"cross_attention_dim": self.cross_attention_dim,
"attention_head_dim": 4,
"resnet_time_scale_shift": "scale_shift",
"class_embed_type": "identity",
}
model = UNet2DConditionModel(**model_kwargs)
return model
@property
def dummy_super_res_kwargs(self):
return {
"sample_size": 64,
"layers_per_block": 1,
"down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"),
"up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"),
"block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
"in_channels": 6,
"out_channels": 3,
}
@property
def dummy_super_res_first(self):
torch.manual_seed(0)
model = UNet2DModel(**self.dummy_super_res_kwargs)
return model
@property
def dummy_super_res_last(self):
# seeded differently to get different unet than `self.dummy_super_res_first`
torch.manual_seed(1)
model = UNet2DModel(**self.dummy_super_res_kwargs)
return model
def get_dummy_components(self):
decoder = self.dummy_decoder
text_proj = self.dummy_text_proj
text_encoder = self.dummy_text_encoder
tokenizer = self.dummy_tokenizer
super_res_first = self.dummy_super_res_first
super_res_last = self.dummy_super_res_last
decoder_scheduler = UnCLIPScheduler(
variance_type="learned_range",
prediction_type="epsilon",
num_train_timesteps=1000,
)
super_res_scheduler = UnCLIPScheduler(
variance_type="fixed_small_log",
prediction_type="epsilon",
num_train_timesteps=1000,
)
feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
image_encoder = self.dummy_image_encoder
return {
"decoder": decoder,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"text_proj": text_proj,
"feature_extractor": feature_extractor,
"image_encoder": image_encoder,
"super_res_first": super_res_first,
"super_res_last": super_res_last,
"decoder_scheduler": decoder_scheduler,
"super_res_scheduler": super_res_scheduler,
}
def get_dummy_inputs(self, device, seed=0, pil_image=True):
input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
if pil_image:
input_image = input_image * 0.5 + 0.5
input_image = input_image.clamp(0, 1)
input_image = input_image.cpu().permute(0, 2, 3, 1).float().numpy()
input_image = DiffusionPipeline.numpy_to_pil(input_image)[0]
return {
"image": input_image,
"generator": generator,
"decoder_num_inference_steps": 2,
"super_res_num_inference_steps": 2,
"output_type": "np",
}
def test_unclip_image_variation_input_tensor(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
pipeline_inputs = self.get_dummy_inputs(device, pil_image=False)
output = pipe(**pipeline_inputs)
image = output.images
tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=False)
image_from_tuple = pipe(
**tuple_pipeline_inputs,
return_dict=False,
)[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array(
[
0.9997,
0.0002,
0.9997,
0.9997,
0.9969,
0.0023,
0.9997,
0.9969,
0.9970,
]
)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
def test_unclip_image_variation_input_image(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
pipeline_inputs = self.get_dummy_inputs(device, pil_image=True)
output = pipe(**pipeline_inputs)
image = output.images
tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=True)
image_from_tuple = pipe(
**tuple_pipeline_inputs,
return_dict=False,
)[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.9997, 0.0003, 0.9997, 0.9997, 0.9970, 0.0024, 0.9997, 0.9971, 0.9971])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
def test_unclip_image_variation_input_list_images(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
pipeline_inputs = self.get_dummy_inputs(device, pil_image=True)
pipeline_inputs["image"] = [
pipeline_inputs["image"],
pipeline_inputs["image"],
]
output = pipe(**pipeline_inputs)
image = output.images
tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=True)
tuple_pipeline_inputs["image"] = [
tuple_pipeline_inputs["image"],
tuple_pipeline_inputs["image"],
]
image_from_tuple = pipe(
**tuple_pipeline_inputs,
return_dict=False,
)[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (2, 64, 64, 3)
expected_slice = np.array(
[
0.9997,
0.9989,
0.0008,
0.0021,
0.9960,
0.0018,
0.0014,
0.0002,
0.9933,
]
)
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
def test_unclip_passed_image_embed(self):
device = torch.device("cpu")
class DummyScheduler:
init_noise_sigma = 1
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
generator = torch.Generator(device=device).manual_seed(0)
dtype = pipe.decoder.dtype
batch_size = 1
shape = (
batch_size,
pipe.decoder.config.in_channels,
pipe.decoder.config.sample_size,
pipe.decoder.config.sample_size,
)
decoder_latents = pipe.prepare_latents(
shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
)
shape = (
batch_size,
pipe.super_res_first.config.in_channels // 2,
pipe.super_res_first.config.sample_size,
pipe.super_res_first.config.sample_size,
)
generator = torch.Generator(device=device).manual_seed(0)
super_res_latents = pipe.prepare_latents(
shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
)
pipeline_inputs = self.get_dummy_inputs(device, pil_image=False)
img_out_1 = pipe(
**pipeline_inputs, decoder_latents=decoder_latents, super_res_latents=super_res_latents
).images
pipeline_inputs = self.get_dummy_inputs(device, pil_image=False)
# Don't pass image, instead pass embedding
image = pipeline_inputs.pop("image")
image_embeddings = pipe.image_encoder(image).image_embeds
img_out_2 = pipe(
**pipeline_inputs,
decoder_latents=decoder_latents,
super_res_latents=super_res_latents,
image_embeddings=image_embeddings,
).images
# make sure passing text embeddings manually is identical
assert np.abs(img_out_1 - img_out_2).max() < 1e-4
# Overriding PipelineTesterMixin::test_attention_slicing_forward_pass
# because UnCLIP GPU undeterminism requires a looser check.
@skip_mps
def test_attention_slicing_forward_pass(self):
test_max_difference = torch_device == "cpu"
# Check is relaxed because there is not a torch 2.0 sliced attention added kv processor
expected_max_diff = 1e-2
self._test_attention_slicing_forward_pass(
test_max_difference=test_max_difference, expected_max_diff=expected_max_diff
)
# Overriding PipelineTesterMixin::test_inference_batch_single_identical
# because UnCLIP undeterminism requires a looser check.
@unittest.skip("UnCLIP produces very large differences. Test is not useful.")
@skip_mps
def test_inference_batch_single_identical(self):
additional_params_copy_to_batched_inputs = [
"decoder_num_inference_steps",
"super_res_num_inference_steps",
]
self._test_inference_batch_single_identical(
additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, expected_max_diff=5e-3
)
def test_inference_batch_consistent(self):
additional_params_copy_to_batched_inputs = [
"decoder_num_inference_steps",
"super_res_num_inference_steps",
]
if torch_device == "mps":
# TODO: MPS errors with larger batch sizes
batch_sizes = [2, 3]
self._test_inference_batch_consistent(
batch_sizes=batch_sizes,
additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs,
)
else:
self._test_inference_batch_consistent(
additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs
)
@skip_mps
def test_dict_tuple_outputs_equivalent(self):
return super().test_dict_tuple_outputs_equivalent()
@unittest.skip("UnCLIP produces very large difference. Test is not useful.")
@skip_mps
def test_save_load_local(self):
return super().test_save_load_local(expected_max_difference=4e-3)
@skip_mps
def test_save_load_optional_components(self):
return super().test_save_load_optional_components()
@unittest.skip("UnCLIP produces very large difference in fp16 vs fp32. Test is not useful.")
def test_float16_inference(self):
super().test_float16_inference(expected_max_diff=1.0)
@nightly
@require_torch_accelerator
class UnCLIPImageVariationPipelineIntegrationTests(unittest.TestCase):
def setUp(self):
# clean up the VRAM before each test
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
# clean up the VRAM after each test
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def test_unclip_image_variation_karlo(self):
input_image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unclip/cat.png"
)
expected_image = load_numpy(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
"/unclip/karlo_v1_alpha_cat_variation_fp16.npy"
)
pipeline = UnCLIPImageVariationPipeline.from_pretrained(
"kakaobrain/karlo-v1-alpha-image-variations", torch_dtype=torch.float16
)
pipeline = pipeline.to(torch_device)
pipeline.set_progress_bar_config(disable=None)
generator = torch.Generator(device="cpu").manual_seed(0)
output = pipeline(
input_image,
generator=generator,
output_type="np",
)
image = output.images[0]
assert image.shape == (256, 256, 3)
assert_mean_pixel_difference(image, expected_image, 15)

View File

@@ -1,764 +0,0 @@
import gc
import random
import unittest
import numpy as np
import torch
from PIL import Image
from transformers import (
CLIPImageProcessor,
CLIPTextModel,
CLIPTokenizer,
CLIPVisionModelWithProjection,
GPT2Tokenizer,
)
from diffusers import (
AutoencoderKL,
DPMSolverMultistepScheduler,
UniDiffuserModel,
UniDiffuserPipeline,
UniDiffuserTextDecoder,
)
from diffusers.utils.testing_utils import (
backend_empty_cache,
enable_full_determinism,
floats_tensor,
load_image,
nightly,
require_torch_accelerator,
torch_device,
)
from diffusers.utils.torch_utils import randn_tensor
from ..pipeline_params import (
IMAGE_TO_IMAGE_IMAGE_PARAMS,
TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
)
from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
enable_full_determinism()
class UniDiffuserPipelineFastTests(
PipelineTesterMixin, PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
):
pipeline_class = UniDiffuserPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
# vae_latents, not latents, is the argument that corresponds to VAE latent inputs
image_latents_params = frozenset(["vae_latents"])
supports_dduf = False
def get_dummy_components(self):
unet = UniDiffuserModel.from_pretrained(
"hf-internal-testing/unidiffuser-diffusers-test",
subfolder="unet",
)
scheduler = DPMSolverMultistepScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
solver_order=3,
)
vae = AutoencoderKL.from_pretrained(
"hf-internal-testing/unidiffuser-diffusers-test",
subfolder="vae",
)
text_encoder = CLIPTextModel.from_pretrained(
"hf-internal-testing/unidiffuser-diffusers-test",
subfolder="text_encoder",
)
clip_tokenizer = CLIPTokenizer.from_pretrained(
"hf-internal-testing/unidiffuser-diffusers-test",
subfolder="clip_tokenizer",
)
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
"hf-internal-testing/unidiffuser-diffusers-test",
subfolder="image_encoder",
)
# From the Stable Diffusion Image Variation pipeline tests
clip_image_processor = CLIPImageProcessor(crop_size=32, size=32)
# image_processor = CLIPImageProcessor.from_pretrained("hf-internal-testing/tiny-random-clip")
text_tokenizer = GPT2Tokenizer.from_pretrained(
"hf-internal-testing/unidiffuser-diffusers-test",
subfolder="text_tokenizer",
)
text_decoder = UniDiffuserTextDecoder.from_pretrained(
"hf-internal-testing/unidiffuser-diffusers-test",
subfolder="text_decoder",
)
components = {
"vae": vae,
"text_encoder": text_encoder,
"image_encoder": image_encoder,
"clip_image_processor": clip_image_processor,
"clip_tokenizer": clip_tokenizer,
"text_decoder": text_decoder,
"text_tokenizer": text_tokenizer,
"unet": unet,
"scheduler": scheduler,
}
return components
def get_dummy_inputs(self, device, seed=0):
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
image = image.cpu().permute(0, 2, 3, 1)[0]
image = Image.fromarray(np.uint8(image)).convert("RGB")
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "an elephant under the sea",
"image": image,
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
"output_type": "np",
}
return inputs
def get_fixed_latents(self, device, seed=0):
if isinstance(device, str):
device = torch.device(device)
generator = torch.Generator(device=device).manual_seed(seed)
# Hardcode the shapes for now.
prompt_latents = randn_tensor((1, 77, 32), generator=generator, device=device, dtype=torch.float32)
vae_latents = randn_tensor((1, 4, 16, 16), generator=generator, device=device, dtype=torch.float32)
clip_latents = randn_tensor((1, 1, 32), generator=generator, device=device, dtype=torch.float32)
latents = {
"prompt_latents": prompt_latents,
"vae_latents": vae_latents,
"clip_latents": clip_latents,
}
return latents
def get_dummy_inputs_with_latents(self, device, seed=0):
# image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
# image = image.cpu().permute(0, 2, 3, 1)[0]
# image = Image.fromarray(np.uint8(image)).convert("RGB")
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg",
)
image = image.resize((32, 32))
latents = self.get_fixed_latents(device, seed=seed)
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "an elephant under the sea",
"image": image,
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
"output_type": "np",
"prompt_latents": latents.get("prompt_latents"),
"vae_latents": latents.get("vae_latents"),
"clip_latents": latents.get("clip_latents"),
}
return inputs
def test_dict_tuple_outputs_equivalent(self):
expected_slice = None
if torch_device == "cpu":
expected_slice = np.array([0.7489, 0.3722, 0.4475, 0.5630, 0.5923, 0.4992, 0.3936, 0.5844, 0.4975])
super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
def test_unidiffuser_default_joint_v0(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
unidiffuser_pipe = UniDiffuserPipeline(**components)
unidiffuser_pipe = unidiffuser_pipe.to(device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'joint'
unidiffuser_pipe.set_joint_mode()
assert unidiffuser_pipe.mode == "joint"
# inputs = self.get_dummy_inputs(device)
inputs = self.get_dummy_inputs_with_latents(device)
# Delete prompt and image for joint inference.
del inputs["prompt"]
del inputs["image"]
sample = unidiffuser_pipe(**inputs)
image = sample.images
text = sample.text
assert image.shape == (1, 32, 32, 3)
image_slice = image[0, -3:, -3:, -1]
expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716])
assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
expected_text_prefix = " no no no "
assert text[0][:10] == expected_text_prefix
def test_unidiffuser_default_joint_no_cfg_v0(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
unidiffuser_pipe = UniDiffuserPipeline(**components)
unidiffuser_pipe = unidiffuser_pipe.to(device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'joint'
unidiffuser_pipe.set_joint_mode()
assert unidiffuser_pipe.mode == "joint"
# inputs = self.get_dummy_inputs(device)
inputs = self.get_dummy_inputs_with_latents(device)
# Delete prompt and image for joint inference.
del inputs["prompt"]
del inputs["image"]
# Set guidance scale to 1.0 to turn off CFG
inputs["guidance_scale"] = 1.0
sample = unidiffuser_pipe(**inputs)
image = sample.images
text = sample.text
assert image.shape == (1, 32, 32, 3)
image_slice = image[0, -3:, -3:, -1]
expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716])
assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
expected_text_prefix = " no no no "
assert text[0][:10] == expected_text_prefix
def test_unidiffuser_default_text2img_v0(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
unidiffuser_pipe = UniDiffuserPipeline(**components)
unidiffuser_pipe = unidiffuser_pipe.to(device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'text2img'
unidiffuser_pipe.set_text_to_image_mode()
assert unidiffuser_pipe.mode == "text2img"
inputs = self.get_dummy_inputs_with_latents(device)
# Delete image for text-conditioned image generation
del inputs["image"]
image = unidiffuser_pipe(**inputs).images
assert image.shape == (1, 32, 32, 3)
image_slice = image[0, -3:, -3:, -1]
expected_slice = np.array([0.5758, 0.6269, 0.6570, 0.4967, 0.4639, 0.5664, 0.5257, 0.5067, 0.5715])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
def test_unidiffuser_default_image_0(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
unidiffuser_pipe = UniDiffuserPipeline(**components)
unidiffuser_pipe = unidiffuser_pipe.to(device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'img'
unidiffuser_pipe.set_image_mode()
assert unidiffuser_pipe.mode == "img"
inputs = self.get_dummy_inputs(device)
# Delete prompt and image for unconditional ("marginal") text generation.
del inputs["prompt"]
del inputs["image"]
image = unidiffuser_pipe(**inputs).images
assert image.shape == (1, 32, 32, 3)
image_slice = image[0, -3:, -3:, -1]
expected_slice = np.array([0.5760, 0.6270, 0.6571, 0.4966, 0.4638, 0.5663, 0.5254, 0.5068, 0.5715])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
def test_unidiffuser_default_text_v0(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
unidiffuser_pipe = UniDiffuserPipeline(**components)
unidiffuser_pipe = unidiffuser_pipe.to(device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'img'
unidiffuser_pipe.set_text_mode()
assert unidiffuser_pipe.mode == "text"
inputs = self.get_dummy_inputs(device)
# Delete prompt and image for unconditional ("marginal") text generation.
del inputs["prompt"]
del inputs["image"]
text = unidiffuser_pipe(**inputs).text
expected_text_prefix = " no no no "
assert text[0][:10] == expected_text_prefix
def test_unidiffuser_default_img2text_v0(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
unidiffuser_pipe = UniDiffuserPipeline(**components)
unidiffuser_pipe = unidiffuser_pipe.to(device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'img2text'
unidiffuser_pipe.set_image_to_text_mode()
assert unidiffuser_pipe.mode == "img2text"
inputs = self.get_dummy_inputs_with_latents(device)
# Delete text for image-conditioned text generation
del inputs["prompt"]
text = unidiffuser_pipe(**inputs).text
expected_text_prefix = " no no no "
assert text[0][:10] == expected_text_prefix
def test_unidiffuser_default_joint_v1(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1")
unidiffuser_pipe = unidiffuser_pipe.to(device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'joint'
unidiffuser_pipe.set_joint_mode()
assert unidiffuser_pipe.mode == "joint"
# inputs = self.get_dummy_inputs(device)
inputs = self.get_dummy_inputs_with_latents(device)
# Delete prompt and image for joint inference.
del inputs["prompt"]
del inputs["image"]
inputs["data_type"] = 1
sample = unidiffuser_pipe(**inputs)
image = sample.images
text = sample.text
assert image.shape == (1, 32, 32, 3)
image_slice = image[0, -3:, -3:, -1]
expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716])
assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
expected_text_prefix = " no no no "
assert text[0][:10] == expected_text_prefix
def test_unidiffuser_default_text2img_v1(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1")
unidiffuser_pipe = unidiffuser_pipe.to(device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'text2img'
unidiffuser_pipe.set_text_to_image_mode()
assert unidiffuser_pipe.mode == "text2img"
inputs = self.get_dummy_inputs_with_latents(device)
# Delete image for text-conditioned image generation
del inputs["image"]
image = unidiffuser_pipe(**inputs).images
assert image.shape == (1, 32, 32, 3)
image_slice = image[0, -3:, -3:, -1]
expected_slice = np.array([0.5758, 0.6269, 0.6570, 0.4967, 0.4639, 0.5664, 0.5257, 0.5067, 0.5715])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
def test_unidiffuser_default_img2text_v1(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1")
unidiffuser_pipe = unidiffuser_pipe.to(device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'img2text'
unidiffuser_pipe.set_image_to_text_mode()
assert unidiffuser_pipe.mode == "img2text"
inputs = self.get_dummy_inputs_with_latents(device)
# Delete text for image-conditioned text generation
del inputs["prompt"]
text = unidiffuser_pipe(**inputs).text
expected_text_prefix = " no no no "
assert text[0][:10] == expected_text_prefix
def test_unidiffuser_text2img_multiple_images(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
unidiffuser_pipe = UniDiffuserPipeline(**components)
unidiffuser_pipe = unidiffuser_pipe.to(device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'text2img'
unidiffuser_pipe.set_text_to_image_mode()
assert unidiffuser_pipe.mode == "text2img"
inputs = self.get_dummy_inputs(device)
# Delete image for text-conditioned image generation
del inputs["image"]
inputs["num_images_per_prompt"] = 2
inputs["num_prompts_per_image"] = 3
image = unidiffuser_pipe(**inputs).images
assert image.shape == (2, 32, 32, 3)
def test_unidiffuser_img2text_multiple_prompts(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
unidiffuser_pipe = UniDiffuserPipeline(**components)
unidiffuser_pipe = unidiffuser_pipe.to(device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'img2text'
unidiffuser_pipe.set_image_to_text_mode()
assert unidiffuser_pipe.mode == "img2text"
inputs = self.get_dummy_inputs(device)
# Delete text for image-conditioned text generation
del inputs["prompt"]
inputs["num_images_per_prompt"] = 2
inputs["num_prompts_per_image"] = 3
text = unidiffuser_pipe(**inputs).text
assert len(text) == 3
def test_unidiffuser_text2img_multiple_images_with_latents(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
unidiffuser_pipe = UniDiffuserPipeline(**components)
unidiffuser_pipe = unidiffuser_pipe.to(device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'text2img'
unidiffuser_pipe.set_text_to_image_mode()
assert unidiffuser_pipe.mode == "text2img"
inputs = self.get_dummy_inputs_with_latents(device)
# Delete image for text-conditioned image generation
del inputs["image"]
inputs["num_images_per_prompt"] = 2
inputs["num_prompts_per_image"] = 3
image = unidiffuser_pipe(**inputs).images
assert image.shape == (2, 32, 32, 3)
def test_unidiffuser_img2text_multiple_prompts_with_latents(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
unidiffuser_pipe = UniDiffuserPipeline(**components)
unidiffuser_pipe = unidiffuser_pipe.to(device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'img2text'
unidiffuser_pipe.set_image_to_text_mode()
assert unidiffuser_pipe.mode == "img2text"
inputs = self.get_dummy_inputs_with_latents(device)
# Delete text for image-conditioned text generation
del inputs["prompt"]
inputs["num_images_per_prompt"] = 2
inputs["num_prompts_per_image"] = 3
text = unidiffuser_pipe(**inputs).text
assert len(text) == 3
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=2e-4)
@require_torch_accelerator
def test_unidiffuser_default_joint_v1_fp16(self):
unidiffuser_pipe = UniDiffuserPipeline.from_pretrained(
"hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16
)
unidiffuser_pipe = unidiffuser_pipe.to(torch_device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'joint'
unidiffuser_pipe.set_joint_mode()
assert unidiffuser_pipe.mode == "joint"
inputs = self.get_dummy_inputs_with_latents(torch_device)
# Delete prompt and image for joint inference.
del inputs["prompt"]
del inputs["image"]
inputs["data_type"] = 1
sample = unidiffuser_pipe(**inputs)
image = sample.images
text = sample.text
assert image.shape == (1, 32, 32, 3)
image_slice = image[0, -3:, -3:, -1]
expected_img_slice = np.array([0.5049, 0.5498, 0.5854, 0.3052, 0.4460, 0.6489, 0.5122, 0.4810, 0.6138])
assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
expected_text_prefix = '" This This'
assert text[0][: len(expected_text_prefix)] == expected_text_prefix
@require_torch_accelerator
def test_unidiffuser_default_text2img_v1_fp16(self):
unidiffuser_pipe = UniDiffuserPipeline.from_pretrained(
"hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16
)
unidiffuser_pipe = unidiffuser_pipe.to(torch_device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'text2img'
unidiffuser_pipe.set_text_to_image_mode()
assert unidiffuser_pipe.mode == "text2img"
inputs = self.get_dummy_inputs_with_latents(torch_device)
# Delete prompt and image for joint inference.
del inputs["image"]
inputs["data_type"] = 1
sample = unidiffuser_pipe(**inputs)
image = sample.images
assert image.shape == (1, 32, 32, 3)
image_slice = image[0, -3:, -3:, -1]
expected_img_slice = np.array([0.5054, 0.5498, 0.5854, 0.3052, 0.4458, 0.6489, 0.5122, 0.4810, 0.6138])
assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
@require_torch_accelerator
def test_unidiffuser_default_img2text_v1_fp16(self):
unidiffuser_pipe = UniDiffuserPipeline.from_pretrained(
"hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16
)
unidiffuser_pipe = unidiffuser_pipe.to(torch_device)
unidiffuser_pipe.set_progress_bar_config(disable=None)
# Set mode to 'img2text'
unidiffuser_pipe.set_image_to_text_mode()
assert unidiffuser_pipe.mode == "img2text"
inputs = self.get_dummy_inputs_with_latents(torch_device)
# Delete prompt and image for joint inference.
del inputs["prompt"]
inputs["data_type"] = 1
text = unidiffuser_pipe(**inputs).text
expected_text_prefix = '" This This'
assert text[0][: len(expected_text_prefix)] == expected_text_prefix
@unittest.skip(
"Test not supported because it has a bunch of direct configs at init and also, this pipeline isn't used that much now."
)
def test_encode_prompt_works_in_isolation():
pass
@nightly
@require_torch_accelerator
class UniDiffuserPipelineSlowTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def get_inputs(self, device, seed=0, generate_latents=False):
generator = torch.manual_seed(seed)
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg"
)
inputs = {
"prompt": "an elephant under the sea",
"image": image,
"generator": generator,
"num_inference_steps": 3,
"guidance_scale": 8.0,
"output_type": "np",
}
if generate_latents:
latents = self.get_fixed_latents(device, seed=seed)
for latent_name, latent_tensor in latents.items():
inputs[latent_name] = latent_tensor
return inputs
def get_fixed_latents(self, device, seed=0):
if isinstance(device, str):
device = torch.device(device)
latent_device = torch.device("cpu")
generator = torch.Generator(device=latent_device).manual_seed(seed)
# Hardcode the shapes for now.
prompt_latents = randn_tensor((1, 77, 768), generator=generator, device=device, dtype=torch.float32)
vae_latents = randn_tensor((1, 4, 64, 64), generator=generator, device=device, dtype=torch.float32)
clip_latents = randn_tensor((1, 1, 512), generator=generator, device=device, dtype=torch.float32)
# Move latents onto desired device.
prompt_latents = prompt_latents.to(device)
vae_latents = vae_latents.to(device)
clip_latents = clip_latents.to(device)
latents = {
"prompt_latents": prompt_latents,
"vae_latents": vae_latents,
"clip_latents": clip_latents,
}
return latents
def test_unidiffuser_default_joint_v1(self):
pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
# inputs = self.get_dummy_inputs(device)
inputs = self.get_inputs(device=torch_device, generate_latents=True)
# Delete prompt and image for joint inference.
del inputs["prompt"]
del inputs["image"]
sample = pipe(**inputs)
image = sample.images
text = sample.text
assert image.shape == (1, 512, 512, 3)
image_slice = image[0, -3:, -3:, -1]
expected_img_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520])
assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-1
expected_text_prefix = "a living room"
assert text[0][: len(expected_text_prefix)] == expected_text_prefix
def test_unidiffuser_default_text2img_v1(self):
pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(device=torch_device, generate_latents=True)
del inputs["image"]
sample = pipe(**inputs)
image = sample.images
assert image.shape == (1, 512, 512, 3)
image_slice = image[0, -3:, -3:, -1]
expected_slice = np.array([0.0242, 0.0103, 0.0022, 0.0129, 0.0000, 0.0090, 0.0376, 0.0508, 0.0005])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
def test_unidiffuser_default_img2text_v1(self):
pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(device=torch_device, generate_latents=True)
del inputs["prompt"]
sample = pipe(**inputs)
text = sample.text
expected_text_prefix = "An astronaut"
assert text[0][: len(expected_text_prefix)] == expected_text_prefix
@nightly
@require_torch_accelerator
class UniDiffuserPipelineNightlyTests(unittest.TestCase):
def setUp(self):
super().setUp()
gc.collect()
backend_empty_cache(torch_device)
def tearDown(self):
super().tearDown()
gc.collect()
backend_empty_cache(torch_device)
def get_inputs(self, device, seed=0, generate_latents=False):
generator = torch.manual_seed(seed)
image = load_image(
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg"
)
inputs = {
"prompt": "an elephant under the sea",
"image": image,
"generator": generator,
"num_inference_steps": 3,
"guidance_scale": 8.0,
"output_type": "np",
}
if generate_latents:
latents = self.get_fixed_latents(device, seed=seed)
for latent_name, latent_tensor in latents.items():
inputs[latent_name] = latent_tensor
return inputs
def get_fixed_latents(self, device, seed=0):
if isinstance(device, str):
device = torch.device(device)
latent_device = torch.device("cpu")
generator = torch.Generator(device=latent_device).manual_seed(seed)
# Hardcode the shapes for now.
prompt_latents = randn_tensor((1, 77, 768), generator=generator, device=device, dtype=torch.float32)
vae_latents = randn_tensor((1, 4, 64, 64), generator=generator, device=device, dtype=torch.float32)
clip_latents = randn_tensor((1, 1, 512), generator=generator, device=device, dtype=torch.float32)
# Move latents onto desired device.
prompt_latents = prompt_latents.to(device)
vae_latents = vae_latents.to(device)
clip_latents = clip_latents.to(device)
latents = {
"prompt_latents": prompt_latents,
"vae_latents": vae_latents,
"clip_latents": clip_latents,
}
return latents
def test_unidiffuser_default_joint_v1_fp16(self):
pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
# inputs = self.get_dummy_inputs(device)
inputs = self.get_inputs(device=torch_device, generate_latents=True)
# Delete prompt and image for joint inference.
del inputs["prompt"]
del inputs["image"]
sample = pipe(**inputs)
image = sample.images
text = sample.text
assert image.shape == (1, 512, 512, 3)
image_slice = image[0, -3:, -3:, -1]
expected_img_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520])
assert np.abs(image_slice.flatten() - expected_img_slice).max() < 2e-1
expected_text_prefix = "a living room"
assert text[0][: len(expected_text_prefix)] == expected_text_prefix
def test_unidiffuser_default_text2img_v1_fp16(self):
pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(device=torch_device, generate_latents=True)
del inputs["image"]
sample = pipe(**inputs)
image = sample.images
assert image.shape == (1, 512, 512, 3)
image_slice = image[0, -3:, -3:, -1]
expected_slice = np.array([0.0242, 0.0103, 0.0022, 0.0129, 0.0000, 0.0090, 0.0376, 0.0508, 0.0005])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1
def test_unidiffuser_default_img2text_v1_fp16(self):
pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(device=torch_device, generate_latents=True)
del inputs["prompt"]
sample = pipe(**inputs)
text = sample.text
expected_text_prefix = "An astronaut"
assert text[0][: len(expected_text_prefix)] == expected_text_prefix

View File

@@ -1,241 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import DDPMWuerstchenScheduler, WuerstchenCombinedPipeline
from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt, WuerstchenPrior
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()
class WuerstchenCombinedPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = WuerstchenCombinedPipeline
params = ["prompt"]
batch_params = ["prompt", "negative_prompt"]
required_optional_params = [
"generator",
"height",
"width",
"latents",
"prior_guidance_scale",
"decoder_guidance_scale",
"negative_prompt",
"num_inference_steps",
"return_dict",
"prior_num_inference_steps",
"output_type",
]
test_xformers_attention = True
@property
def text_embedder_hidden_size(self):
return 32
@property
def dummy_prior(self):
torch.manual_seed(0)
model_kwargs = {"c_in": 2, "c": 8, "depth": 2, "c_cond": 32, "c_r": 8, "nhead": 2}
model = WuerstchenPrior(**model_kwargs)
return model.eval()
@property
def dummy_tokenizer(self):
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
return tokenizer
@property
def dummy_prior_text_encoder(self):
torch.manual_seed(0)
config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=self.text_embedder_hidden_size,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
return CLIPTextModel(config).eval()
@property
def dummy_text_encoder(self):
torch.manual_seed(0)
config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
projection_dim=self.text_embedder_hidden_size,
hidden_size=self.text_embedder_hidden_size,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
return CLIPTextModel(config).eval()
@property
def dummy_vqgan(self):
torch.manual_seed(0)
model_kwargs = {
"bottleneck_blocks": 1,
"num_vq_embeddings": 2,
}
model = PaellaVQModel(**model_kwargs)
return model.eval()
@property
def dummy_decoder(self):
torch.manual_seed(0)
model_kwargs = {
"c_cond": self.text_embedder_hidden_size,
"c_hidden": [320],
"nhead": [-1],
"blocks": [4],
"level_config": ["CT"],
"clip_embd": self.text_embedder_hidden_size,
"inject_effnet": [False],
}
model = WuerstchenDiffNeXt(**model_kwargs)
return model.eval()
def get_dummy_components(self):
prior = self.dummy_prior
prior_text_encoder = self.dummy_prior_text_encoder
scheduler = DDPMWuerstchenScheduler()
tokenizer = self.dummy_tokenizer
text_encoder = self.dummy_text_encoder
decoder = self.dummy_decoder
vqgan = self.dummy_vqgan
components = {
"tokenizer": tokenizer,
"text_encoder": text_encoder,
"decoder": decoder,
"vqgan": vqgan,
"scheduler": scheduler,
"prior_prior": prior,
"prior_text_encoder": prior_text_encoder,
"prior_tokenizer": tokenizer,
"prior_scheduler": scheduler,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "horse",
"generator": generator,
"prior_guidance_scale": 4.0,
"decoder_guidance_scale": 4.0,
"num_inference_steps": 2,
"prior_num_inference_steps": 2,
"output_type": "np",
"height": 128,
"width": 128,
}
return inputs
def test_wuerstchen(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
output = pipe(**self.get_dummy_inputs(device))
image = output.images
image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0]
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[-3:, -3:, -1]
assert image.shape == (1, 128, 128, 3)
expected_slice = np.array([0.7616304, 0.0, 1.0, 0.0, 1.0, 0.0, 0.05925313, 0.0, 0.951898])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, (
f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
)
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2, (
f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
)
@require_torch_accelerator
def test_offloads(self):
pipes = []
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components).to(torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_sequential_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
components = self.get_dummy_components()
sd_pipe = self.pipeline_class(**components)
sd_pipe.enable_model_cpu_offload(device=torch_device)
pipes.append(sd_pipe)
image_slices = []
for pipe in pipes:
inputs = self.get_dummy_inputs(torch_device)
image = pipe(**inputs).images
image_slices.append(image[0, -3:, -3:, -1].flatten())
assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=1e-2)
@unittest.skip(reason="flakey and float16 requires CUDA")
def test_float16_inference(self):
super().test_float16_inference()
@unittest.skip(reason="Test not supported.")
def test_callback_inputs(self):
pass
@unittest.skip(reason="Test not supported.")
def test_callback_cfg(self):
pass

View File

@@ -1,192 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import DDPMWuerstchenScheduler, WuerstchenDecoderPipeline
from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt
from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()
class WuerstchenDecoderPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = WuerstchenDecoderPipeline
params = ["prompt"]
batch_params = ["image_embeddings", "prompt", "negative_prompt"]
required_optional_params = [
"num_images_per_prompt",
"num_inference_steps",
"latents",
"negative_prompt",
"guidance_scale",
"output_type",
"return_dict",
]
test_xformers_attention = False
callback_cfg_params = ["image_embeddings", "text_encoder_hidden_states"]
@property
def text_embedder_hidden_size(self):
return 32
@property
def time_input_dim(self):
return 32
@property
def block_out_channels_0(self):
return self.time_input_dim
@property
def time_embed_dim(self):
return self.time_input_dim * 4
@property
def dummy_tokenizer(self):
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
return tokenizer
@property
def dummy_text_encoder(self):
torch.manual_seed(0)
config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
projection_dim=self.text_embedder_hidden_size,
hidden_size=self.text_embedder_hidden_size,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
return CLIPTextModel(config).eval()
@property
def dummy_vqgan(self):
torch.manual_seed(0)
model_kwargs = {
"bottleneck_blocks": 1,
"num_vq_embeddings": 2,
}
model = PaellaVQModel(**model_kwargs)
return model.eval()
@property
def dummy_decoder(self):
torch.manual_seed(0)
model_kwargs = {
"c_cond": self.text_embedder_hidden_size,
"c_hidden": [320],
"nhead": [-1],
"blocks": [4],
"level_config": ["CT"],
"clip_embd": self.text_embedder_hidden_size,
"inject_effnet": [False],
}
model = WuerstchenDiffNeXt(**model_kwargs)
return model.eval()
def get_dummy_components(self):
decoder = self.dummy_decoder
text_encoder = self.dummy_text_encoder
tokenizer = self.dummy_tokenizer
vqgan = self.dummy_vqgan
scheduler = DDPMWuerstchenScheduler()
components = {
"decoder": decoder,
"vqgan": vqgan,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"scheduler": scheduler,
"latent_dim_scale": 4.0,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"image_embeddings": torch.ones((1, 4, 4, 4), device=device),
"prompt": "horse",
"generator": generator,
"guidance_scale": 1.0,
"num_inference_steps": 2,
"output_type": "np",
}
return inputs
def test_wuerstchen_decoder(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
output = pipe(**self.get_dummy_inputs(device))
image = output.images
image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)
image_slice = image[0, -3:, -3:, -1]
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
assert image.shape == (1, 64, 64, 3)
expected_slice = np.array([0.0000, 0.0000, 0.0089, 1.0000, 1.0000, 0.3927, 1.0000, 1.0000, 1.0000])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
@skip_mps
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(expected_max_diff=1e-5)
@skip_mps
def test_attention_slicing_forward_pass(self):
test_max_difference = torch_device == "cpu"
test_mean_pixel_difference = False
self._test_attention_slicing_forward_pass(
test_max_difference=test_max_difference,
test_mean_pixel_difference=test_mean_pixel_difference,
)
@unittest.skip(reason="bf16 not supported and requires CUDA")
def test_float16_inference(self):
super().test_float16_inference()
@unittest.skip("Test not supported.")
def test_encode_prompt_works_in_isolation(self):
super().test_encode_prompt_works_in_isolation()

View File

@@ -1,273 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import DDPMWuerstchenScheduler, WuerstchenPriorPipeline
from diffusers.pipelines.wuerstchen import WuerstchenPrior
from diffusers.utils.import_utils import is_peft_available
from diffusers.utils.testing_utils import enable_full_determinism, require_peft_backend, skip_mps, torch_device
if is_peft_available():
from peft import LoraConfig
from peft.tuners.tuners_utils import BaseTunerLayer
from ..test_pipelines_common import PipelineTesterMixin
enable_full_determinism()
class WuerstchenPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
pipeline_class = WuerstchenPriorPipeline
params = ["prompt"]
batch_params = ["prompt", "negative_prompt"]
required_optional_params = [
"num_images_per_prompt",
"generator",
"num_inference_steps",
"latents",
"negative_prompt",
"guidance_scale",
"output_type",
"return_dict",
]
test_xformers_attention = False
callback_cfg_params = ["text_encoder_hidden_states"]
@property
def text_embedder_hidden_size(self):
return 32
@property
def time_input_dim(self):
return 32
@property
def block_out_channels_0(self):
return self.time_input_dim
@property
def time_embed_dim(self):
return self.time_input_dim * 4
@property
def dummy_tokenizer(self):
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
return tokenizer
@property
def dummy_text_encoder(self):
torch.manual_seed(0)
config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=self.text_embedder_hidden_size,
intermediate_size=37,
layer_norm_eps=1e-05,
num_attention_heads=4,
num_hidden_layers=5,
pad_token_id=1,
vocab_size=1000,
)
return CLIPTextModel(config).eval()
@property
def dummy_prior(self):
torch.manual_seed(0)
model_kwargs = {
"c_in": 2,
"c": 8,
"depth": 2,
"c_cond": 32,
"c_r": 8,
"nhead": 2,
}
model = WuerstchenPrior(**model_kwargs)
return model.eval()
def get_dummy_components(self):
prior = self.dummy_prior
text_encoder = self.dummy_text_encoder
tokenizer = self.dummy_tokenizer
scheduler = DDPMWuerstchenScheduler()
components = {
"prior": prior,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"scheduler": scheduler,
}
return components
def get_dummy_inputs(self, device, seed=0):
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "horse",
"generator": generator,
"guidance_scale": 4.0,
"num_inference_steps": 2,
"output_type": "np",
}
return inputs
def test_wuerstchen_prior(self):
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
output = pipe(**self.get_dummy_inputs(device))
image = output.image_embeddings
image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0]
image_slice = image[0, 0, 0, -10:]
image_from_tuple_slice = image_from_tuple[0, 0, 0, -10:]
assert image.shape == (1, 2, 24, 24)
expected_slice = np.array(
[
-7172.837,
-3438.855,
-1093.312,
388.8835,
-7471.467,
-7998.1206,
-5328.259,
218.00089,
-2731.5745,
-8056.734,
]
)
assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 5e-2
@skip_mps
def test_inference_batch_single_identical(self):
self._test_inference_batch_single_identical(
expected_max_diff=3e-1,
)
@skip_mps
def test_attention_slicing_forward_pass(self):
test_max_difference = torch_device == "cpu"
test_mean_pixel_difference = False
self._test_attention_slicing_forward_pass(
test_max_difference=test_max_difference,
test_mean_pixel_difference=test_mean_pixel_difference,
)
@unittest.skip(reason="flaky for now")
def test_float16_inference(self):
super().test_float16_inference()
# override because we need to make sure latent_mean and latent_std to be 0
def test_callback_inputs(self):
components = self.get_dummy_components()
components["latent_mean"] = 0
components["latent_std"] = 0
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
self.assertTrue(
hasattr(pipe, "_callback_tensor_inputs"),
f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
)
def callback_inputs_test(pipe, i, t, callback_kwargs):
missing_callback_inputs = set()
for v in pipe._callback_tensor_inputs:
if v not in callback_kwargs:
missing_callback_inputs.add(v)
self.assertTrue(
len(missing_callback_inputs) == 0, f"Missing callback tensor inputs: {missing_callback_inputs}"
)
last_i = pipe.num_timesteps - 1
if i == last_i:
callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
return callback_kwargs
inputs = self.get_dummy_inputs(torch_device)
inputs["callback_on_step_end"] = callback_inputs_test
inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
inputs["output_type"] = "latent"
output = pipe(**inputs)[0]
assert output.abs().sum() == 0
def check_if_lora_correctly_set(self, model) -> bool:
"""
Checks if the LoRA layers are correctly set with peft
"""
for module in model.modules():
if isinstance(module, BaseTunerLayer):
return True
return False
def get_lora_components(self):
prior = self.dummy_prior
prior_lora_config = LoraConfig(
r=4, lora_alpha=4, target_modules=["to_q", "to_k", "to_v", "to_out.0"], init_lora_weights=False
)
return prior, prior_lora_config
@require_peft_backend
def test_inference_with_prior_lora(self):
_, prior_lora_config = self.get_lora_components()
device = "cpu"
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(device)
pipe.set_progress_bar_config(disable=None)
output_no_lora = pipe(**self.get_dummy_inputs(device))
image_embed = output_no_lora.image_embeddings
self.assertTrue(image_embed.shape == (1, 2, 24, 24))
pipe.prior.add_adapter(prior_lora_config)
self.assertTrue(self.check_if_lora_correctly_set(pipe.prior), "Lora not correctly set in prior")
output_lora = pipe(**self.get_dummy_inputs(device))
lora_image_embed = output_lora.image_embeddings
self.assertTrue(image_embed.shape == lora_image_embed.shape)
@unittest.skip("Test not supported as dtype cannot be inferred without the text encoder otherwise.")
def test_encode_prompt_works_in_isolation(self):
pass