Mirror of https://github.com/huggingface/diffusers.git, synced 2026-01-27 17:22:53 +03:00
[tests] remove tests for deprecated pipelines. (#11879)
* remove tests for deprecated pipelines.
* remove folders
* test_pipelines_common
@@ -1,171 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer

from diffusers import AmusedPipeline, AmusedScheduler, UVit2DModel, VQModel
from diffusers.utils.testing_utils import (
    enable_full_determinism,
    require_torch_accelerator,
    slow,
    torch_device,
)

from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class AmusedPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = AmusedPipeline
    params = TEXT_TO_IMAGE_PARAMS | {"encoder_hidden_states", "negative_encoder_hidden_states"}
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    test_layerwise_casting = True
    test_group_offloading = True

    def get_dummy_components(self):
        torch.manual_seed(0)
        transformer = UVit2DModel(
            hidden_size=8,
            use_bias=False,
            hidden_dropout=0.0,
            cond_embed_dim=8,
            micro_cond_encode_dim=2,
            micro_cond_embed_dim=10,
            encoder_hidden_size=8,
            vocab_size=32,
            codebook_size=8,
            in_channels=8,
            block_out_channels=8,
            num_res_blocks=1,
            downsample=True,
            upsample=True,
            block_num_heads=1,
            num_hidden_layers=1,
            num_attention_heads=1,
            attention_dropout=0.0,
            intermediate_size=8,
            layer_norm_eps=1e-06,
            ln_elementwise_affine=True,
        )
        scheduler = AmusedScheduler(mask_token_id=31)
        torch.manual_seed(0)
        vqvae = VQModel(
            act_fn="silu",
            block_out_channels=[8],
            down_block_types=["DownEncoderBlock2D"],
            in_channels=3,
            latent_channels=8,
            layers_per_block=1,
            norm_num_groups=8,
            num_vq_embeddings=8,
            out_channels=3,
            sample_size=8,
            up_block_types=["UpDecoderBlock2D"],
            mid_block_add_attention=False,
            lookup_from_codebook=True,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=8,
            intermediate_size=8,
            layer_norm_eps=1e-05,
            num_attention_heads=1,
            num_hidden_layers=1,
            pad_token_id=1,
            vocab_size=1000,
            projection_dim=8,
        )
        text_encoder = CLIPTextModelWithProjection(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        components = {
            "transformer": transformer,
            "scheduler": scheduler,
            "vqvae": vqvae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "output_type": "np",
            "height": 4,
            "width": 4,
        }
        return inputs

    def test_inference_batch_consistent(self, batch_sizes=[2]):
        self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False)

    @unittest.skip("aMUSEd does not support lists of generators")
    def test_inference_batch_single_identical(self): ...


@slow
@require_torch_accelerator
class AmusedPipelineSlowTests(unittest.TestCase):
    def test_amused_256(self):
        pipe = AmusedPipeline.from_pretrained("amused/amused-256")
        pipe.to(torch_device)
        image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
        image_slice = image[0, -3:, -3:, -1].flatten()
        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.4011, 0.3992, 0.379, 0.3856, 0.3772, 0.3711, 0.3919, 0.385, 0.3625])
        assert np.abs(image_slice - expected_slice).max() < 0.003

    def test_amused_256_fp16(self):
        pipe = AmusedPipeline.from_pretrained("amused/amused-256", variant="fp16", torch_dtype=torch.float16)
        pipe.to(torch_device)
        image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
        image_slice = image[0, -3:, -3:, -1].flatten()
        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.0554, 0.05129, 0.0344, 0.0452, 0.0476, 0.0271, 0.0495, 0.0527, 0.0158])
        assert np.abs(image_slice - expected_slice).max() < 0.007

    def test_amused_512(self):
        pipe = AmusedPipeline.from_pretrained("amused/amused-512")
        pipe.to(torch_device)
        image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.1199, 0.1171, 0.1229, 0.1188, 0.1210, 0.1147, 0.1260, 0.1346, 0.1152])
        assert np.abs(image_slice - expected_slice).max() < 0.003

    def test_amused_512_fp16(self):
        pipe = AmusedPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
        pipe.to(torch_device)
        image = pipe("dog", generator=torch.Generator().manual_seed(0), num_inference_steps=2, output_type="np").images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.1509, 0.1492, 0.1531, 0.1485, 0.1501, 0.1465, 0.1581, 0.1690, 0.1499])
        assert np.abs(image_slice - expected_slice).max() < 0.003
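All of the assertions above follow the same slice-regression pattern: compare a small corner patch of the output against recorded reference values rather than the full image. A minimal sketch of that pattern (the helper name is ours, not part of the test suite):

import numpy as np


def check_image_slice(image: np.ndarray, expected_slice: np.ndarray, atol: float) -> None:
    # Compare the bottom-right 3x3 patch of the last channel against recorded
    # reference values; a cheap proxy for a full-image regression test.
    image_slice = image[0, -3:, -3:, -1].flatten()
    assert np.abs(image_slice - expected_slice).max() < atol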
@@ -1,215 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer

from diffusers import AmusedImg2ImgPipeline, AmusedScheduler, UVit2DModel, VQModel
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
    enable_full_determinism,
    require_torch_accelerator,
    slow,
    torch_device,
)

from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class AmusedImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = AmusedImg2ImgPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "latents"}
    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}

    def get_dummy_components(self):
        torch.manual_seed(0)
        transformer = UVit2DModel(
            hidden_size=8,
            use_bias=False,
            hidden_dropout=0.0,
            cond_embed_dim=8,
            micro_cond_encode_dim=2,
            micro_cond_embed_dim=10,
            encoder_hidden_size=8,
            vocab_size=32,
            codebook_size=8,
            in_channels=8,
            block_out_channels=8,
            num_res_blocks=1,
            downsample=True,
            upsample=True,
            block_num_heads=1,
            num_hidden_layers=1,
            num_attention_heads=1,
            attention_dropout=0.0,
            intermediate_size=8,
            layer_norm_eps=1e-06,
            ln_elementwise_affine=True,
        )
        scheduler = AmusedScheduler(mask_token_id=31)
        torch.manual_seed(0)
        vqvae = VQModel(
            act_fn="silu",
            block_out_channels=[8],
            down_block_types=["DownEncoderBlock2D"],
            in_channels=3,
            latent_channels=8,
            layers_per_block=1,
            norm_num_groups=8,
            num_vq_embeddings=32,
            out_channels=3,
            sample_size=8,
            up_block_types=["UpDecoderBlock2D"],
            mid_block_add_attention=False,
            lookup_from_codebook=True,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=8,
            intermediate_size=8,
            layer_norm_eps=1e-05,
            num_attention_heads=1,
            num_hidden_layers=1,
            pad_token_id=1,
            vocab_size=1000,
            projection_dim=8,
        )
        text_encoder = CLIPTextModelWithProjection(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        components = {
            "transformer": transformer,
            "scheduler": scheduler,
            "vqvae": vqvae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        image = torch.full((1, 3, 4, 4), 1.0, dtype=torch.float32, device=device)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "output_type": "np",
            "image": image,
        }
        return inputs

    def test_inference_batch_consistent(self, batch_sizes=[2]):
        self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False)

    @unittest.skip("aMUSEd does not support lists of generators")
    def test_inference_batch_single_identical(self): ...


@slow
@require_torch_accelerator
class AmusedImg2ImgPipelineSlowTests(unittest.TestCase):
    def test_amused_256(self):
        pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-256")
        pipe.to(torch_device)
        image = (
            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
            .resize((256, 256))
            .convert("RGB")
        )
        image = pipe(
            "winter mountains",
            image,
            generator=torch.Generator().manual_seed(0),
            num_inference_steps=2,
            output_type="np",
        ).images
        image_slice = image[0, -3:, -3:, -1].flatten()
        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.9993, 1.0, 0.9996, 1.0, 0.9995, 0.9925, 0.999, 0.9954, 1.0])
        assert np.abs(image_slice - expected_slice).max() < 0.01

    def test_amused_256_fp16(self):
        pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-256", torch_dtype=torch.float16, variant="fp16")
        pipe.to(torch_device)
        image = (
            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
            .resize((256, 256))
            .convert("RGB")
        )
        image = pipe(
            "winter mountains",
            image,
            generator=torch.Generator().manual_seed(0),
            num_inference_steps=2,
            output_type="np",
        ).images
        image_slice = image[0, -3:, -3:, -1].flatten()
        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.998, 0.998, 0.994, 0.9944, 0.996, 0.9908, 1.0, 1.0, 0.9986])
        assert np.abs(image_slice - expected_slice).max() < 0.01

    def test_amused_512(self):
        pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-512")
        pipe.to(torch_device)
        image = (
            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
            .resize((512, 512))
            .convert("RGB")
        )
        image = pipe(
            "winter mountains",
            image,
            generator=torch.Generator().manual_seed(0),
            num_inference_steps=2,
            output_type="np",
        ).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.2809, 0.1879, 0.2027, 0.2418, 0.1852, 0.2145, 0.2484, 0.2425, 0.2317])
        assert np.abs(image_slice - expected_slice).max() < 0.1

    def test_amused_512_fp16(self):
        pipe = AmusedImg2ImgPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
        pipe.to(torch_device)
        image = (
            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg")
            .resize((512, 512))
            .convert("RGB")
        )
        image = pipe(
            "winter mountains",
            image,
            generator=torch.Generator().manual_seed(0),
            num_inference_steps=2,
            output_type="np",
        ).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.2795, 0.1867, 0.2028, 0.2450, 0.1856, 0.2140, 0.2473, 0.2406, 0.2313])
        assert np.abs(image_slice - expected_slice).max() < 0.1
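Each `get_dummy_inputs` above branches on MPS, presumably because seeding a device-local `torch.Generator` on Apple Silicon has been unreliable; falling back to the global CPU generator keeps the fast tests deterministic. The branch, factored into a standalone helper (our naming):

import torch


def make_generator(device: str, seed: int = 0) -> torch.Generator:
    # torch.manual_seed returns the default CPU generator, which is the
    # fallback these tests use on MPS.
    if str(device).startswith("mps"):
        return torch.manual_seed(seed)
    return torch.Generator(device=device).manual_seed(seed)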
@@ -1,281 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer

from diffusers import AmusedInpaintPipeline, AmusedScheduler, UVit2DModel, VQModel
from diffusers.utils import load_image
from diffusers.utils.testing_utils import (
    Expectations,
    enable_full_determinism,
    require_torch_accelerator,
    slow,
    torch_device,
)

from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class AmusedInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = AmusedInpaintPipeline
    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"width", "height"}
    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}

    def get_dummy_components(self):
        torch.manual_seed(0)
        transformer = UVit2DModel(
            hidden_size=8,
            use_bias=False,
            hidden_dropout=0.0,
            cond_embed_dim=8,
            micro_cond_encode_dim=2,
            micro_cond_embed_dim=10,
            encoder_hidden_size=8,
            vocab_size=32,
            codebook_size=32,
            in_channels=8,
            block_out_channels=8,
            num_res_blocks=1,
            downsample=True,
            upsample=True,
            block_num_heads=1,
            num_hidden_layers=1,
            num_attention_heads=1,
            attention_dropout=0.0,
            intermediate_size=8,
            layer_norm_eps=1e-06,
            ln_elementwise_affine=True,
        )
        scheduler = AmusedScheduler(mask_token_id=31)
        torch.manual_seed(0)
        vqvae = VQModel(
            act_fn="silu",
            block_out_channels=[8],
            down_block_types=["DownEncoderBlock2D"],
            in_channels=3,
            latent_channels=8,
            layers_per_block=1,
            norm_num_groups=8,
            num_vq_embeddings=32,
            out_channels=3,
            sample_size=8,
            up_block_types=["UpDecoderBlock2D"],
            mid_block_add_attention=False,
            lookup_from_codebook=True,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=8,
            intermediate_size=8,
            layer_norm_eps=1e-05,
            num_attention_heads=1,
            num_hidden_layers=1,
            pad_token_id=1,
            vocab_size=1000,
            projection_dim=8,
        )
        text_encoder = CLIPTextModelWithProjection(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        components = {
            "transformer": transformer,
            "scheduler": scheduler,
            "vqvae": vqvae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        image = torch.full((1, 3, 4, 4), 1.0, dtype=torch.float32, device=device)
        mask_image = torch.full((1, 1, 4, 4), 1.0, dtype=torch.float32, device=device)
        mask_image[0, 0, 0, 0] = 0
        mask_image[0, 0, 0, 1] = 0
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "output_type": "np",
            "image": image,
            "mask_image": mask_image,
        }
        return inputs

    def test_inference_batch_consistent(self, batch_sizes=[2]):
        self._test_inference_batch_consistent(batch_sizes=batch_sizes, batch_generator=False)

    @unittest.skip("aMUSEd does not support lists of generators")
    def test_inference_batch_single_identical(self): ...


@slow
@require_torch_accelerator
class AmusedInpaintPipelineSlowTests(unittest.TestCase):
    def test_amused_256(self):
        pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-256")
        pipe.to(torch_device)
        image = (
            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
            .resize((256, 256))
            .convert("RGB")
        )
        mask_image = (
            load_image(
                "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
            )
            .resize((256, 256))
            .convert("L")
        )
        image = pipe(
            "winter mountains",
            image,
            mask_image,
            generator=torch.Generator().manual_seed(0),
            num_inference_steps=2,
            output_type="np",
        ).images
        image_slice = image[0, -3:, -3:, -1].flatten()
        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.0699, 0.0716, 0.0608, 0.0715, 0.0797, 0.0638, 0.0802, 0.0924, 0.0634])
        assert np.abs(image_slice - expected_slice).max() < 0.1

    def test_amused_256_fp16(self):
        pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-256", variant="fp16", torch_dtype=torch.float16)
        pipe.to(torch_device)
        image = (
            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
            .resize((256, 256))
            .convert("RGB")
        )
        mask_image = (
            load_image(
                "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
            )
            .resize((256, 256))
            .convert("L")
        )
        image = pipe(
            "winter mountains",
            image,
            mask_image,
            generator=torch.Generator().manual_seed(0),
            num_inference_steps=2,
            output_type="np",
        ).images
        image_slice = image[0, -3:, -3:, -1].flatten()
        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.0735, 0.0749, 0.065, 0.0739, 0.0805, 0.0667, 0.0802, 0.0923, 0.0622])
        assert np.abs(image_slice - expected_slice).max() < 0.1

    def test_amused_512(self):
        pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-512")
        pipe.to(torch_device)
        image = (
            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
            .resize((512, 512))
            .convert("RGB")
        )
        mask_image = (
            load_image(
                "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
            )
            .resize((512, 512))
            .convert("L")
        )
        image = pipe(
            "winter mountains",
            image,
            mask_image,
            generator=torch.Generator().manual_seed(0),
            num_inference_steps=2,
            output_type="np",
        ).images
        image_slice = image[0, -3:, -3:, -1].flatten()
        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0005, 0.0])
        assert np.abs(image_slice - expected_slice).max() < 0.05

    def test_amused_512_fp16(self):
        pipe = AmusedInpaintPipeline.from_pretrained("amused/amused-512", variant="fp16", torch_dtype=torch.float16)
        pipe.to(torch_device)
        image = (
            load_image("https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg")
            .resize((512, 512))
            .convert("RGB")
        )
        mask_image = (
            load_image(
                "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png"
            )
            .resize((512, 512))
            .convert("L")
        )
        image = pipe(
            "winter mountains",
            image,
            mask_image,
            generator=torch.Generator().manual_seed(0),
            num_inference_steps=2,
            output_type="np",
        ).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slices = Expectations(
            {
                ("xpu", 3): np.array([0.0274, 0.0211, 0.0154, 0.0257, 0.0299, 0.0170, 0.0326, 0.0420, 0.0150]),
                ("cuda", 7): np.array([0.0227, 0.0157, 0.0098, 0.0213, 0.0250, 0.0127, 0.0280, 0.0380, 0.0095]),
            }
        )
        expected_slice = expected_slices.get_expectation()
        assert np.abs(image_slice - expected_slice).max() < 0.003
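`test_amused_512_fp16` above records different reference slices per accelerator through `Expectations` and lets `get_expectation()` pick the one matching the current hardware. A simplified, hypothetical sketch of the lookup idea (not the actual `diffusers.utils.testing_utils` implementation, which resolves the device itself):

import numpy as np


class SimpleExpectations:
    """Hypothetical stand-in for the per-device expectation lookup used above."""

    def __init__(self, table: dict):
        # table maps (device_type, major_version) -> reference array
        self.table = table

    def get_expectation(self, device_type: str, major_version: int) -> np.ndarray:
        if (device_type, major_version) in self.table:
            return self.table[(device_type, major_version)]
        # fall back to any entry recorded for the same device type
        for (recorded_device, _), value in self.table.items():
            if recorded_device == device_type:
                return value
        raise KeyError(f"no expectation recorded for {device_type}")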
@@ -1,461 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch
import torch.nn.functional as F
from transformers import (
    ClapTextConfig,
    ClapTextModelWithProjection,
    RobertaTokenizer,
    SpeechT5HifiGan,
    SpeechT5HifiGanConfig,
)

from diffusers import (
    AudioLDMPipeline,
    AutoencoderKL,
    DDIMScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    UNet2DConditionModel,
)
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import backend_empty_cache, enable_full_determinism, nightly, torch_device

from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = AudioLDMPipeline
    params = TEXT_TO_AUDIO_PARAMS
    batch_params = TEXT_TO_AUDIO_BATCH_PARAMS
    required_optional_params = frozenset(
        [
            "num_inference_steps",
            "num_waveforms_per_prompt",
            "generator",
            "latents",
            "output_type",
            "return_dict",
            "callback",
            "callback_steps",
        ]
    )

    supports_dduf = False

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(8, 16),
            layers_per_block=1,
            norm_num_groups=8,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=(8, 16),
            class_embed_type="simple_projection",
            projection_class_embeddings_input_dim=8,
            class_embeddings_concat=True,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[8, 16],
            in_channels=1,
            out_channels=1,
            norm_num_groups=8,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = ClapTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=8,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=1,
            num_hidden_layers=1,
            pad_token_id=1,
            vocab_size=1000,
            projection_dim=8,
        )
        text_encoder = ClapTextModelWithProjection(text_encoder_config)
        tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)

        vocoder_config = SpeechT5HifiGanConfig(
            model_in_dim=8,
            sampling_rate=16000,
            upsample_initial_channel=16,
            upsample_rates=[2, 2],
            upsample_kernel_sizes=[4, 4],
            resblock_kernel_sizes=[3, 7],
            resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]],
            normalize_before=False,
        )

        vocoder = SpeechT5HifiGan(vocoder_config)

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "vocoder": vocoder,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A hammer hitting a wooden surface",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
        }
        return inputs

    def test_audioldm_ddim(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        audioldm_pipe = AudioLDMPipeline(**components)
        audioldm_pipe = audioldm_pipe.to(torch_device)
        audioldm_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = audioldm_pipe(**inputs)
        audio = output.audios[0]

        assert audio.ndim == 1
        assert len(audio) == 256

        audio_slice = audio[:10]
        expected_slice = np.array(
            [-0.0050, 0.0050, -0.0060, 0.0033, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0033]
        )

        assert np.abs(audio_slice - expected_slice).max() < 1e-2

    def test_audioldm_prompt_embeds(self):
        components = self.get_dummy_components()
        audioldm_pipe = AudioLDMPipeline(**components)
        audioldm_pipe = audioldm_pipe.to(torch_device)
        audioldm_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = audioldm_pipe(**inputs)
        audio_1 = output.audios[0]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        text_inputs = audioldm_pipe.tokenizer(
            prompt,
            padding="max_length",
            max_length=audioldm_pipe.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_inputs = text_inputs["input_ids"].to(torch_device)

        prompt_embeds = audioldm_pipe.text_encoder(text_inputs)
        prompt_embeds = prompt_embeds.text_embeds
        # additional L_2 normalization over each hidden-state
        prompt_embeds = F.normalize(prompt_embeds, dim=-1)

        inputs["prompt_embeds"] = prompt_embeds

        # forward
        output = audioldm_pipe(**inputs)
        audio_2 = output.audios[0]

        assert np.abs(audio_1 - audio_2).max() < 1e-2

    def test_audioldm_negative_prompt_embeds(self):
        components = self.get_dummy_components()
        audioldm_pipe = AudioLDMPipeline(**components)
        audioldm_pipe = audioldm_pipe.to(torch_device)
        audioldm_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        negative_prompt = 3 * ["this is a negative prompt"]
        inputs["negative_prompt"] = negative_prompt
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = audioldm_pipe(**inputs)
        audio_1 = output.audios[0]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        embeds = []
        for p in [prompt, negative_prompt]:
            text_inputs = audioldm_pipe.tokenizer(
                p,
                padding="max_length",
                max_length=audioldm_pipe.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_inputs = text_inputs["input_ids"].to(torch_device)

            text_embeds = audioldm_pipe.text_encoder(text_inputs)
            text_embeds = text_embeds.text_embeds
            # additional L_2 normalization over each hidden-state
            text_embeds = F.normalize(text_embeds, dim=-1)

            embeds.append(text_embeds)

        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds

        # forward
        output = audioldm_pipe(**inputs)
        audio_2 = output.audios[0]

        assert np.abs(audio_1 - audio_2).max() < 1e-2

    def test_audioldm_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
        audioldm_pipe = AudioLDMPipeline(**components)
        audioldm_pipe = audioldm_pipe.to(device)
        audioldm_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "egg cracking"
        output = audioldm_pipe(**inputs, negative_prompt=negative_prompt)
        audio = output.audios[0]

        assert audio.ndim == 1
        assert len(audio) == 256

        audio_slice = audio[:10]
        expected_slice = np.array(
            [-0.0051, 0.0050, -0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032]
        )

        assert np.abs(audio_slice - expected_slice).max() < 1e-2

    def test_audioldm_num_waveforms_per_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
        audioldm_pipe = AudioLDMPipeline(**components)
        audioldm_pipe = audioldm_pipe.to(device)
        audioldm_pipe.set_progress_bar_config(disable=None)

        prompt = "A hammer hitting a wooden surface"

        # test num_waveforms_per_prompt=1 (default)
        audios = audioldm_pipe(prompt, num_inference_steps=2).audios

        assert audios.shape == (1, 256)

        # test num_waveforms_per_prompt=1 (default) for batch of prompts
        batch_size = 2
        audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios

        assert audios.shape == (batch_size, 256)

        # test num_waveforms_per_prompt for single prompt
        num_waveforms_per_prompt = 2
        audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios

        assert audios.shape == (num_waveforms_per_prompt, 256)

        # test num_waveforms_per_prompt for batch of prompts
        batch_size = 2
        audios = audioldm_pipe(
            [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
        ).audios

        assert audios.shape == (batch_size * num_waveforms_per_prompt, 256)

    def test_audioldm_audio_length_in_s(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        audioldm_pipe = AudioLDMPipeline(**components)
        audioldm_pipe = audioldm_pipe.to(torch_device)
        audioldm_pipe.set_progress_bar_config(disable=None)
        vocoder_sampling_rate = audioldm_pipe.vocoder.config.sampling_rate

        inputs = self.get_dummy_inputs(device)
        output = audioldm_pipe(audio_length_in_s=0.016, **inputs)
        audio = output.audios[0]

        assert audio.ndim == 1
        assert len(audio) / vocoder_sampling_rate == 0.016

        output = audioldm_pipe(audio_length_in_s=0.032, **inputs)
        audio = output.audios[0]

        assert audio.ndim == 1
        assert len(audio) / vocoder_sampling_rate == 0.032

    def test_audioldm_vocoder_model_in_dim(self):
        components = self.get_dummy_components()
        audioldm_pipe = AudioLDMPipeline(**components)
        audioldm_pipe = audioldm_pipe.to(torch_device)
        audioldm_pipe.set_progress_bar_config(disable=None)

        prompt = ["hey"]

        output = audioldm_pipe(prompt, num_inference_steps=1)
        audio_shape = output.audios.shape
        assert audio_shape == (1, 256)

        config = audioldm_pipe.vocoder.config
        config.model_in_dim *= 2
        audioldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device)
        output = audioldm_pipe(prompt, num_inference_steps=1)
        audio_shape = output.audios.shape
        # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram
        assert audio_shape == (1, 256)

    def test_attention_slicing_forward_pass(self):
        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)

    def test_inference_batch_single_identical(self):
        self._test_inference_batch_single_identical()

    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    def test_xformers_attention_forwardGenerator_pass(self):
        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)


@nightly
class AudioLDMPipelineSlowTests(unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "A hammer hitting a wooden surface",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 2.5,
        }
        return inputs

    def test_audioldm(self):
        audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm")
        audioldm_pipe = audioldm_pipe.to(torch_device)
        audioldm_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 25
        audio = audioldm_pipe(**inputs).audios[0]

        assert audio.ndim == 1
        assert len(audio) == 81920

        audio_slice = audio[77230:77240]
        expected_slice = np.array(
            [-0.4884, -0.4607, 0.0023, 0.5007, 0.5896, 0.5151, 0.3813, -0.0208, -0.3687, -0.4315]
        )
        max_diff = np.abs(expected_slice - audio_slice).max()
        assert max_diff < 1e-2


@nightly
class AudioLDMPipelineNightlyTests(unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "A hammer hitting a wooden surface",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 2.5,
        }
        return inputs

    def test_audioldm_lms(self):
        audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm")
        audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config)
        audioldm_pipe = audioldm_pipe.to(torch_device)
        audioldm_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        audio = audioldm_pipe(**inputs).audios[0]

        assert audio.ndim == 1
        assert len(audio) == 81920

        audio_slice = audio[27780:27790]
        expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212])
        max_diff = np.abs(expected_slice - audio_slice).max()
        assert max_diff < 3e-2
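`test_audioldm_audio_length_in_s` holds because the waveform length is simply the requested duration times the vocoder sampling rate; with the 16 kHz dummy vocoder, 0.016 s is exactly the 256 samples the other fast tests assert on. A quick standalone check:

def expected_num_samples(audio_length_in_s: float, sampling_rate: int) -> int:
    # waveform length = duration (s) * sampling rate (Hz)
    return round(audio_length_in_s * sampling_rate)


assert expected_num_samples(0.016, 16_000) == 256
assert expected_num_samples(0.032, 16_000) == 512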
@@ -1,204 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import torch
from PIL import Image
from transformers import CLIPTokenizer
from transformers.models.blip_2.configuration_blip_2 import Blip2Config
from transformers.models.clip.configuration_clip import CLIPTextConfig

from diffusers import AutoencoderKL, BlipDiffusionPipeline, PNDMScheduler, UNet2DConditionModel
from diffusers.utils.testing_utils import enable_full_determinism
from src.diffusers.pipelines.blip_diffusion.blip_image_processing import BlipImageProcessor
from src.diffusers.pipelines.blip_diffusion.modeling_blip2 import Blip2QFormerModel
from src.diffusers.pipelines.blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel

from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class BlipDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = BlipDiffusionPipeline
    params = [
        "prompt",
        "reference_image",
        "source_subject_category",
        "target_subject_category",
    ]
    batch_params = [
        "prompt",
        "reference_image",
        "source_subject_category",
        "target_subject_category",
    ]
    required_optional_params = [
        "generator",
        "height",
        "width",
        "latents",
        "guidance_scale",
        "num_inference_steps",
        "neg_prompt",
        "prompt_strength",
        "prompt_reps",
    ]

    supports_dduf = False

    def get_dummy_components(self):
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            vocab_size=1000,
            hidden_size=8,
            intermediate_size=8,
            projection_dim=8,
            num_hidden_layers=1,
            num_attention_heads=1,
            max_position_embeddings=77,
        )
        text_encoder = ContextCLIPTextModel(text_encoder_config)

        vae = AutoencoderKL(
            in_channels=4,
            out_channels=4,
            down_block_types=("DownEncoderBlock2D",),
            up_block_types=("UpDecoderBlock2D",),
            block_out_channels=(8,),
            norm_num_groups=8,
            layers_per_block=1,
            act_fn="silu",
            latent_channels=4,
            sample_size=8,
        )

        blip_vision_config = {
            "hidden_size": 8,
            "intermediate_size": 8,
            "num_hidden_layers": 1,
            "num_attention_heads": 1,
            "image_size": 224,
            "patch_size": 14,
            "hidden_act": "quick_gelu",
        }

        blip_qformer_config = {
            "vocab_size": 1000,
            "hidden_size": 8,
            "num_hidden_layers": 1,
            "num_attention_heads": 1,
            "intermediate_size": 8,
            "max_position_embeddings": 512,
            "cross_attention_frequency": 1,
            "encoder_hidden_size": 8,
        }
        qformer_config = Blip2Config(
            vision_config=blip_vision_config,
            qformer_config=blip_qformer_config,
            num_query_tokens=8,
            tokenizer="hf-internal-testing/tiny-random-bert",
        )
        qformer = Blip2QFormerModel(qformer_config)

        unet = UNet2DConditionModel(
            block_out_channels=(8, 16),
            norm_num_groups=8,
            layers_per_block=1,
            sample_size=16,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=8,
        )
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        scheduler = PNDMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            set_alpha_to_one=False,
            skip_prk_steps=True,
        )

        vae.eval()
        qformer.eval()
        text_encoder.eval()

        image_processor = BlipImageProcessor()

        components = {
            "text_encoder": text_encoder,
            "vae": vae,
            "qformer": qformer,
            "unet": unet,
            "tokenizer": tokenizer,
            "scheduler": scheduler,
            "image_processor": image_processor,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        np.random.seed(seed)
        reference_image = np.random.rand(32, 32, 3) * 255
        reference_image = Image.fromarray(reference_image.astype("uint8")).convert("RGBA")

        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "swimming underwater",
            "generator": generator,
            "reference_image": reference_image,
            "source_subject_category": "dog",
            "target_subject_category": "dog",
            "height": 32,
            "width": 32,
            "guidance_scale": 7.5,
            "num_inference_steps": 2,
            "output_type": "np",
        }
        return inputs

    def test_blipdiffusion(self):
        device = "cpu"
        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        image = pipe(**self.get_dummy_inputs(device))[0]
        image_slice = image[0, -3:, -3:, 0]

        assert image.shape == (1, 16, 16, 4)

        expected_slice = np.array(
            [0.5329548, 0.8372512, 0.33269387, 0.82096875, 0.43657133, 0.3783, 0.5953028, 0.51934963, 0.42142007]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, (
            f"expected_slice {expected_slice}, but got {image_slice.flatten()}"
        )

    @unittest.skip("Test not supported because of complexities in deriving query_embeds.")
    def test_encode_prompt_works_in_isolation(self):
        pass
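The dummy inputs above synthesize a throwaway reference image from seeded NumPy noise and promote it to RGBA. The same recipe as a standalone helper (a sketch; the function name is ours):

import numpy as np
from PIL import Image


def random_rgba_image(size: int = 32, seed: int = 0) -> Image.Image:
    # Seeded random uint8 RGB noise, converted to the RGBA mode the
    # pipeline's image processor expects.
    np.random.seed(seed)
    array = (np.random.rand(size, size, 3) * 255).astype("uint8")
    return Image.fromarray(array).convert("RGBA")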
@@ -1,228 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import torch
from PIL import Image
from transformers import CLIPTokenizer
from transformers.models.blip_2.configuration_blip_2 import Blip2Config
from transformers.models.clip.configuration_clip import CLIPTextConfig

from diffusers import (
    AutoencoderKL,
    BlipDiffusionControlNetPipeline,
    ControlNetModel,
    PNDMScheduler,
    UNet2DConditionModel,
)
from diffusers.utils.testing_utils import enable_full_determinism, torch_device
from src.diffusers.pipelines.blip_diffusion.blip_image_processing import BlipImageProcessor
from src.diffusers.pipelines.blip_diffusion.modeling_blip2 import Blip2QFormerModel
from src.diffusers.pipelines.blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel

from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class BlipDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = BlipDiffusionControlNetPipeline
    params = [
        "prompt",
        "reference_image",
        "source_subject_category",
        "target_subject_category",
        "condtioning_image",
    ]
    batch_params = [
        "prompt",
        "reference_image",
        "source_subject_category",
        "target_subject_category",
        "condtioning_image",
    ]
    required_optional_params = [
        "generator",
        "height",
        "width",
        "latents",
        "guidance_scale",
        "num_inference_steps",
        "neg_prompt",
        "prompt_strength",
        "prompt_reps",
    ]

    supports_dduf = False

    def get_dummy_components(self):
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            vocab_size=1000,
            hidden_size=16,
            intermediate_size=16,
            projection_dim=16,
            num_hidden_layers=1,
            num_attention_heads=1,
            max_position_embeddings=77,
        )
        text_encoder = ContextCLIPTextModel(text_encoder_config)

        vae = AutoencoderKL(
            in_channels=4,
            out_channels=4,
            down_block_types=("DownEncoderBlock2D",),
            up_block_types=("UpDecoderBlock2D",),
            block_out_channels=(32,),
            layers_per_block=1,
            act_fn="silu",
            latent_channels=4,
            norm_num_groups=16,
            sample_size=16,
        )

        blip_vision_config = {
            "hidden_size": 16,
            "intermediate_size": 16,
            "num_hidden_layers": 1,
            "num_attention_heads": 1,
            "image_size": 224,
            "patch_size": 14,
            "hidden_act": "quick_gelu",
        }

        blip_qformer_config = {
            "vocab_size": 1000,
            "hidden_size": 16,
            "num_hidden_layers": 1,
            "num_attention_heads": 1,
            "intermediate_size": 16,
            "max_position_embeddings": 512,
            "cross_attention_frequency": 1,
            "encoder_hidden_size": 16,
        }
        qformer_config = Blip2Config(
            vision_config=blip_vision_config,
            qformer_config=blip_qformer_config,
            num_query_tokens=16,
            tokenizer="hf-internal-testing/tiny-random-bert",
        )
        qformer = Blip2QFormerModel(qformer_config)

        unet = UNet2DConditionModel(
            block_out_channels=(4, 16),
            layers_per_block=1,
            norm_num_groups=4,
            sample_size=16,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=16,
        )
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        scheduler = PNDMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            set_alpha_to_one=False,
            skip_prk_steps=True,
        )
        controlnet = ControlNetModel(
            block_out_channels=(4, 16),
            layers_per_block=1,
            in_channels=4,
            norm_num_groups=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            cross_attention_dim=16,
            conditioning_embedding_out_channels=(8, 16),
        )

        vae.eval()
        qformer.eval()
        text_encoder.eval()

        image_processor = BlipImageProcessor()

        components = {
            "text_encoder": text_encoder,
            "vae": vae,
            "qformer": qformer,
            "unet": unet,
            "tokenizer": tokenizer,
            "scheduler": scheduler,
            "controlnet": controlnet,
            "image_processor": image_processor,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        np.random.seed(seed)
        reference_image = np.random.rand(32, 32, 3) * 255
        reference_image = Image.fromarray(reference_image.astype("uint8")).convert("RGBA")
        cond_image = np.random.rand(32, 32, 3) * 255
        cond_image = Image.fromarray(cond_image.astype("uint8")).convert("RGBA")

        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "swimming underwater",
            "generator": generator,
            "reference_image": reference_image,
            "condtioning_image": cond_image,
            "source_subject_category": "dog",
            "target_subject_category": "dog",
            "height": 32,
            "width": 32,
            "guidance_scale": 7.5,
            "num_inference_steps": 2,
            "output_type": "np",
        }
        return inputs

    def test_dict_tuple_outputs_equivalent(self):
        expected_slice = None
        if torch_device == "cpu":
            expected_slice = np.array([0.4803, 0.3865, 0.1422, 0.6119, 0.2283, 0.6365, 0.5453, 0.5205, 0.3581])
        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)

    def test_blipdiffusion_controlnet(self):
        device = "cpu"
        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        image = pipe(**self.get_dummy_inputs(device))[0]
        image_slice = image[0, -3:, -3:, 0]

        assert image.shape == (1, 16, 16, 4)
        expected_slice = np.array([0.7953, 0.7136, 0.6597, 0.4779, 0.7389, 0.4111, 0.5826, 0.4150, 0.8422])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, (
            f"expected_slice {expected_slice}, but got {image_slice.flatten()}"
        )

    @unittest.skip("Test not supported because of complexities in deriving query_embeds.")
    def test_encode_prompt_works_in_isolation(self):
        pass
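Note that "condtioning_image" (sic) appears to match the argument name the pipeline itself exposes, which would be why these tests preserve the misspelling. Both BLIP-Diffusion files also end with the same slice assertion, whose failure message is only useful if it prints the expected and actual values, as the corrected f-strings above do. Factored out as a sketch (our helper, not part of the suite):

import numpy as np


def assert_slice_close(actual, expected, atol: float = 1e-2) -> None:
    # Report both arrays on failure so a regression is diagnosable from CI logs.
    max_diff = np.abs(actual.flatten() - expected.flatten()).max()
    assert max_diff < atol, f"expected {expected}, got {actual.flatten()} (max diff {max_diff:.4f})"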
@@ -1,352 +0,0 @@
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AsymmetricAutoencoderKL,
    AutoencoderKL,
    AutoencoderTiny,
    ConsistencyDecoderVAE,
    ControlNetXSAdapter,
    DDIMScheduler,
    LCMScheduler,
    StableDiffusionControlNetXSPipeline,
    UNet2DConditionModel,
)
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    load_image,
    require_accelerator,
    require_torch_accelerator,
    slow,
    torch_device,
)
from diffusers.utils.torch_utils import randn_tensor

from ...models.autoencoders.vae import (
    get_asym_autoencoder_kl_config,
    get_autoencoder_kl_config,
    get_autoencoder_tiny_config,
    get_consistency_vae_config,
)
from ..pipeline_params import (
    IMAGE_TO_IMAGE_IMAGE_PARAMS,
    TEXT_TO_IMAGE_BATCH_PARAMS,
    TEXT_TO_IMAGE_IMAGE_PARAMS,
    TEXT_TO_IMAGE_PARAMS,
)
from ..test_pipelines_common import (
    PipelineKarrasSchedulerTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
    SDFunctionTesterMixin,
)


enable_full_determinism()


def to_np(tensor):
    if isinstance(tensor, torch.Tensor):
        tensor = tensor.detach().cpu().numpy()

    return tensor


class ControlNetXSPipelineFastTests(
    PipelineLatentTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineTesterMixin,
    SDFunctionTesterMixin,
    unittest.TestCase,
):
    pipeline_class = StableDiffusionControlNetXSPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS

    test_attention_slicing = False
    test_layerwise_casting = True
    test_group_offloading = True

    def get_dummy_components(self, time_cond_proj_dim=None):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(4, 8),
            layers_per_block=2,
            sample_size=16,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=8,
            norm_num_groups=4,
            time_cond_proj_dim=time_cond_proj_dim,
            use_linear_projection=True,
        )
        torch.manual_seed(0)
        controlnet = ControlNetXSAdapter.from_unet(
            unet=unet,
            size_ratio=1,
            learn_time_embedding=True,
            conditioning_embedding_out_channels=(2, 2),
        )
        torch.manual_seed(0)
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[4, 8],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            norm_num_groups=2,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=8,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "controlnet": controlnet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components
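    # A note on the adapter built above: ControlNetXSAdapter.from_unet derives the control
    # branch from the base UNet's block layout, so the two always fit together when the
    # pipeline fuses them. size_ratio scales the branch width relative to the base UNet;
    # an illustrative variant (values hypothetical):
    #   adapter = ControlNetXSAdapter.from_unet(unet=unet, size_ratio=0.5)  # half-width branch
    # whereas size_ratio=1 (used here) mirrors the base UNet's width.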
    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        controlnet_embedder_scale_factor = 2
        image = randn_tensor(
            (1, 3, 8 * controlnet_embedder_scale_factor, 8 * controlnet_embedder_scale_factor),
            generator=generator,
            device=torch.device(device),
        )

        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
            "image": image,
        }

        return inputs
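    # Why the "mps" branch above: device-bound torch.Generator objects are not supported on
    # the MPS backend, so the helper falls back to the global torch.manual_seed(seed), which
    # returns a CPU generator. Equivalent guard as a one-liner (illustrative):
    #   gen = torch.manual_seed(seed) if str(device).startswith("mps") else torch.Generator(device=device).manual_seed(seed)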
    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    def test_xformers_attention_forwardGenerator_pass(self):
        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)

    def test_inference_batch_single_identical(self):
        self._test_inference_batch_single_identical(expected_max_diff=2e-3)

    def test_controlnet_lcm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components(time_cond_proj_dim=8)
        sd_pipe = StableDiffusionControlNetXSPipeline(**components)
        sd_pipe.scheduler = LCMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 16, 16, 3)
        expected_slice = np.array([0.745, 0.753, 0.767, 0.543, 0.523, 0.502, 0.314, 0.521, 0.478])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_to_dtype(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.set_progress_bar_config(disable=None)

        # pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the dtype from pipe.components
        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))

        pipe.to(dtype=torch.float16)
        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
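    # Because the pipeline fuses `unet` and `controlnet` into a fresh UNetControlNetXSModel at
    # construction time, `pipe.unet` is not the module that was passed in. Iterating
    # `pipe.components`, as the dtype checks above do, is the reliable way to inspect state:
    #
    #   for name, component in pipe.components.items():  # illustrative
    #       if hasattr(component, "dtype"):
    #           print(name, component.dtype)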
    def test_multi_vae(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        block_out_channels = pipe.vae.config.block_out_channels
        norm_num_groups = pipe.vae.config.norm_num_groups

        vae_classes = [AutoencoderKL, AsymmetricAutoencoderKL, ConsistencyDecoderVAE, AutoencoderTiny]
        configs = [
            get_autoencoder_kl_config(block_out_channels, norm_num_groups),
            get_asym_autoencoder_kl_config(block_out_channels, norm_num_groups),
            get_consistency_vae_config(block_out_channels, norm_num_groups),
            get_autoencoder_tiny_config(block_out_channels),
        ]

        out_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]

        for vae_cls, config in zip(vae_classes, configs):
            vae = vae_cls(**config)
            vae = vae.to(torch_device)
            components["vae"] = vae
            vae_pipe = self.pipeline_class(**components)

            # pipeline creates a new UNetControlNetXSModel under the hood, which isn't on device.
            # So we need to move the new pipe to device.
            vae_pipe.to(torch_device)
            vae_pipe.set_progress_bar_config(disable=None)

            out_vae_np = vae_pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]

            assert out_vae_np.shape == out_np.shape

    @require_accelerator
    def test_to_device(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.set_progress_bar_config(disable=None)

        pipe.to("cpu")
        # pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the device from pipe.components
        model_devices = [
            component.device.type for component in pipe.components.values() if hasattr(component, "device")
        ]
        self.assertTrue(all(device == "cpu" for device in model_devices))

        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
        self.assertTrue(np.isnan(output_cpu).sum() == 0)

        pipe.to(torch_device)
        model_devices = [
            component.device.type for component in pipe.components.values() if hasattr(component, "device")
        ]
        self.assertTrue(all(device == torch_device for device in model_devices))

        output_device = pipe(**self.get_dummy_inputs(torch_device))[0]
        self.assertTrue(np.isnan(to_np(output_device)).sum() == 0)

    def test_encode_prompt_works_in_isolation(self):
        extra_required_param_value_dict = {
            "device": torch.device(torch_device).type,
            "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
        }
        return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)

@slow
@require_torch_accelerator
class ControlNetXSPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_canny(self):
        controlnet = ControlNetXSAdapter.from_pretrained(
            "UmerHA/Testing-ConrolNetXS-SD2.1-canny", torch_dtype=torch.float16
        )
        pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16
        )
        pipe.enable_model_cpu_offload(device=torch_device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "bird"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (768, 512, 3)

        original_image = image[-3:, -3:, -1].flatten()
        expected_image = np.array([0.1963, 0.229, 0.2659, 0.2109, 0.2332, 0.2827, 0.2534, 0.2422, 0.2808])
        assert np.allclose(original_image, expected_image, atol=1e-04)

    def test_depth(self):
        controlnet = ControlNetXSAdapter.from_pretrained(
            "UmerHA/Testing-ConrolNetXS-SD2.1-depth", torch_dtype=torch.float16
        )
        pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-1-base", controlnet=controlnet, torch_dtype=torch.float16
        )
        pipe.enable_model_cpu_offload(device=torch_device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "Stormtrooper's lecture"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (512, 512, 3)

        original_image = image[-3:, -3:, -1].flatten()
        expected_image = np.array([0.4844, 0.4937, 0.4956, 0.4663, 0.5039, 0.5044, 0.4565, 0.4883, 0.4941])
        assert np.allclose(original_image, expected_image, atol=1e-04)
@@ -1,393 +0,0 @@
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer

from diffusers import (
    AsymmetricAutoencoderKL,
    AutoencoderKL,
    AutoencoderTiny,
    ConsistencyDecoderVAE,
    ControlNetXSAdapter,
    EulerDiscreteScheduler,
    StableDiffusionXLControlNetXSPipeline,
    UNet2DConditionModel,
)
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    load_image,
    require_torch_accelerator,
    slow,
    torch_device,
)
from diffusers.utils.torch_utils import randn_tensor

from ...models.autoencoders.vae import (
    get_asym_autoencoder_kl_config,
    get_autoencoder_kl_config,
    get_autoencoder_tiny_config,
    get_consistency_vae_config,
)
from ..pipeline_params import (
    IMAGE_TO_IMAGE_IMAGE_PARAMS,
    TEXT_TO_IMAGE_BATCH_PARAMS,
    TEXT_TO_IMAGE_IMAGE_PARAMS,
    TEXT_TO_IMAGE_PARAMS,
)
from ..test_pipelines_common import (
    PipelineKarrasSchedulerTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
)


enable_full_determinism()


class StableDiffusionXLControlNetXSPipelineFastTests(
    PipelineLatentTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineTesterMixin,
    unittest.TestCase,
):
    pipeline_class = StableDiffusionXLControlNetXSPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS

    test_attention_slicing = False
    test_layerwise_casting = True
    test_group_offloading = True

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(4, 8),
            layers_per_block=2,
            sample_size=16,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            use_linear_projection=True,
            norm_num_groups=4,
            # SDXL-specific config below
            attention_head_dim=(2, 4),
            addition_embed_type="text_time",
            addition_time_embed_dim=8,
            transformer_layers_per_block=(1, 2),
            projection_class_embeddings_input_dim=56,  # 6 * 8 (addition_time_embed_dim) + 8 (cross_attention_dim)
            cross_attention_dim=8,
        )
        torch.manual_seed(0)
        controlnet = ControlNetXSAdapter.from_unet(
            unet=unet,
            size_ratio=0.5,
            learn_time_embedding=True,
            conditioning_embedding_out_channels=(2, 2),
        )
        torch.manual_seed(0)
        scheduler = EulerDiscreteScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            steps_offset=1,
            beta_schedule="scaled_linear",
            timestep_spacing="leading",
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[4, 8],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            norm_num_groups=2,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=4,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
            # SDXL-specific config below
            hidden_act="gelu",
            projection_dim=8,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "controlnet": controlnet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "text_encoder_2": text_encoder_2,
            "tokenizer_2": tokenizer_2,
            "feature_extractor": None,
        }
        return components
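    # SDXL detail: the pipeline consumes two text encoders. The tiny test config reuses one
    # CLIPTextConfig for both `text_encoder` (per-token hidden states) and `text_encoder_2`
    # (pooled, projected embedding); real SDXL checkpoints pair two different encoders
    # (CLIP ViT-L and OpenCLIP ViT-bigG), but that distinction does not matter at this scale.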
    # Copied from test_controlnet_sdxl.py
    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        controlnet_embedder_scale_factor = 2
        image = randn_tensor(
            (1, 3, 8 * controlnet_embedder_scale_factor, 8 * controlnet_embedder_scale_factor),
            generator=generator,
            device=torch.device(device),
        )

        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "np",
            "image": image,
        }

        return inputs

    # Copied from test_controlnet_sdxl.py
    def test_attention_slicing_forward_pass(self):
        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)

    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    # Copied from test_controlnet_sdxl.py
    def test_xformers_attention_forwardGenerator_pass(self):
        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)

    # Copied from test_controlnet_sdxl.py
    def test_inference_batch_single_identical(self):
        self._test_inference_batch_single_identical(expected_max_diff=2e-3)

    @unittest.skip("We test this functionality elsewhere already.")
    def test_save_load_optional_components(self):
        pass

    @require_torch_accelerator
    # Copied from test_controlnet_sdxl.py
    def test_stable_diffusion_xl_offloads(self):
        pipes = []
        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components).to(torch_device)
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_model_cpu_offload(device=torch_device)
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
        pipes.append(sd_pipe)

        image_slices = []
        for pipe in pipes:
            pipe.unet.set_default_attn_processor()

            inputs = self.get_dummy_inputs(torch_device)
            image = pipe(**inputs).images

            image_slices.append(image[0, -3:, -3:, -1].flatten())

        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3
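    # The three pipelines above differ only in weight placement during the forward pass
    # (fully on the accelerator, per-model CPU offload, per-submodule sequential offload).
    # With a fixed seed the outputs should therefore agree, which is what the two 1e-3
    # max-difference assertions verify.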
    # Copied from test_controlnet_sdxl.py
    def test_stable_diffusion_xl_multi_prompts(self):
        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components).to(torch_device)

        # forward with single prompt
        inputs = self.get_dummy_inputs(torch_device)
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        # forward with same prompt duplicated
        inputs = self.get_dummy_inputs(torch_device)
        inputs["prompt_2"] = inputs["prompt"]
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        # ensure the results are equal
        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

        # forward with different prompt
        inputs = self.get_dummy_inputs(torch_device)
        inputs["prompt_2"] = "different prompt"
        output = sd_pipe(**inputs)
        image_slice_3 = output.images[0, -3:, -3:, -1]

        # ensure the results are not equal
        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4

        # manually set a negative_prompt
        inputs = self.get_dummy_inputs(torch_device)
        inputs["negative_prompt"] = "negative prompt"
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        # forward with same negative_prompt duplicated
        inputs = self.get_dummy_inputs(torch_device)
        inputs["negative_prompt"] = "negative prompt"
        inputs["negative_prompt_2"] = inputs["negative_prompt"]
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        # ensure the results are equal
        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

        # forward with different negative_prompt
        inputs = self.get_dummy_inputs(torch_device)
        inputs["negative_prompt"] = "negative prompt"
        inputs["negative_prompt_2"] = "different negative prompt"
        output = sd_pipe(**inputs)
        image_slice_3 = output.images[0, -3:, -3:, -1]

        # ensure the results are not equal
        assert np.abs(image_slice_1.flatten() - image_slice_3.flatten()).max() > 1e-4

    # Copied from test_controlnetxs.py
    def test_to_dtype(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.set_progress_bar_config(disable=None)

        # pipeline creates a new UNetControlNetXSModel under the hood. So we need to check the dtype from pipe.components
        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))

        pipe.to(dtype=torch.float16)
        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))

    def test_multi_vae(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        block_out_channels = pipe.vae.config.block_out_channels
        norm_num_groups = pipe.vae.config.norm_num_groups

        vae_classes = [AutoencoderKL, AsymmetricAutoencoderKL, ConsistencyDecoderVAE, AutoencoderTiny]
        configs = [
            get_autoencoder_kl_config(block_out_channels, norm_num_groups),
            get_asym_autoencoder_kl_config(block_out_channels, norm_num_groups),
            get_consistency_vae_config(block_out_channels, norm_num_groups),
            get_autoencoder_tiny_config(block_out_channels),
        ]

        out_np = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]

        for vae_cls, config in zip(vae_classes, configs):
            vae = vae_cls(**config)
            vae = vae.to(torch_device)
            components["vae"] = vae
            vae_pipe = self.pipeline_class(**components)

            # pipeline creates a new UNetControlNetXSModel under the hood, which isn't on device.
            # So we need to move the new pipe to device.
            vae_pipe.to(torch_device)
            vae_pipe.set_progress_bar_config(disable=None)

            out_vae_np = vae_pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="np"))[0]

            assert out_vae_np.shape == out_np.shape

@slow
@require_torch_accelerator
class StableDiffusionXLControlNetXSPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_canny(self):
        controlnet = ControlNetXSAdapter.from_pretrained(
            "UmerHA/Testing-ConrolNetXS-SDXL-canny", torch_dtype=torch.float16
        )
        pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
        )
        pipe.enable_sequential_cpu_offload(device=torch_device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "bird"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
        )

        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images

        assert images[0].shape == (768, 512, 3)

        original_image = images[0, -3:, -3:, -1].flatten()
        expected_image = np.array([0.3202, 0.3151, 0.3328, 0.3172, 0.337, 0.3381, 0.3378, 0.3389, 0.3224])
        assert np.allclose(original_image, expected_image, atol=1e-04)

    def test_depth(self):
        controlnet = ControlNetXSAdapter.from_pretrained(
            "UmerHA/Testing-ConrolNetXS-SDXL-depth", torch_dtype=torch.float16
        )
        pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
        )
        pipe.enable_sequential_cpu_offload(device=torch_device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "Stormtrooper's lecture"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
        )

        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images

        assert images[0].shape == (512, 512, 3)

        original_image = images[0, -3:, -3:, -1].flatten()
        expected_image = np.array([0.5448, 0.5437, 0.5426, 0.5543, 0.553, 0.5475, 0.5595, 0.5602, 0.5529])
        assert np.allclose(original_image, expected_image, atol=1e-04)
@@ -1,174 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch

from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    nightly,
    require_torch_accelerator,
    skip_mps,
    torch_device,
)

from ..pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = DanceDiffusionPipeline
    params = UNCONDITIONAL_AUDIO_GENERATION_PARAMS
    required_optional_params = PipelineTesterMixin.required_optional_params - {
        "callback",
        "latents",
        "callback_steps",
        "output_type",
        "num_images_per_prompt",
    }
    batch_params = UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS
    test_attention_slicing = False

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet1DModel(
            block_out_channels=(32, 32, 64),
            extra_in_channels=16,
            sample_size=512,
            sample_rate=16_000,
            in_channels=2,
            out_channels=2,
            flip_sin_to_cos=True,
            use_timestep_embedding=False,
            time_embedding_type="fourier",
            mid_block_type="UNetMidBlock1D",
            down_block_types=("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"),
            up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"),
        )
        scheduler = IPNDMScheduler()

        components = {
            "unet": unet,
            "scheduler": scheduler,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "batch_size": 1,
            "generator": generator,
            "num_inference_steps": 4,
        }
        return inputs

    def test_dance_diffusion(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        pipe = DanceDiffusionPipeline(**components)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = pipe(**inputs)
        audio = output.audios

        audio_slice = audio[0, -3:, -3:]

        assert audio.shape == (1, 2, components["unet"].sample_size)
        expected_slice = np.array([-0.7265, 1.0000, -0.8388, 0.1175, 0.9498, -1.0000])
        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
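    # Shape note for the assertion above: DanceDiffusion returns raw stereo waveforms of shape
    # (batch, channels, sample_size) = (1, 2, 512) for this dummy UNet; the trailing 2x3 slice
    # yields the six samples compared against expected_slice.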
    @skip_mps
    def test_save_load_local(self):
        return super().test_save_load_local()

    @skip_mps
    def test_dict_tuple_outputs_equivalent(self):
        return super().test_dict_tuple_outputs_equivalent(expected_max_difference=3e-3)

    @skip_mps
    def test_save_load_optional_components(self):
        return super().test_save_load_optional_components()

    @skip_mps
    def test_attention_slicing_forward_pass(self):
        return super().test_attention_slicing_forward_pass()

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=3e-3)


@nightly
@require_torch_accelerator
class PipelineIntegrationTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_dance_diffusion(self):
        device = torch_device

        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k")
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
        audio = output.audios

        audio_slice = audio[0, -3:, -3:]

        assert audio.shape == (1, 2, pipe.unet.config.sample_size)
        expected_slice = np.array([-0.0192, -0.0231, -0.0318, -0.0059, 0.0002, -0.0020])

        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2

    def test_dance_diffusion_fp16(self):
        device = torch_device

        pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", torch_dtype=torch.float16)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
        audio = output.audios

        audio_slice = audio[0, -3:, -3:]

        assert audio.shape == (1, 2, pipe.unet.config.sample_size)
        expected_slice = np.array([-0.0367, -0.0488, -0.0771, -0.0525, -0.0444, -0.0341])

        assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2
@@ -1,283 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import unittest

import numpy as np
import pytest
import torch
from transformers import (
    CLIPImageProcessor,
    CLIPTextConfig,
    CLIPTextModel,
    CLIPTokenizer,
    CLIPVisionConfig,
    CLIPVisionModelWithProjection,
)

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    I2VGenXLPipeline,
)
from diffusers.models.unets import I2VGenXLUNet
from diffusers.utils import is_xformers_available, load_image
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    floats_tensor,
    is_torch_version,
    numpy_cosine_similarity_distance,
    require_torch_accelerator,
    skip_mps,
    slow,
    torch_device,
)

from ..test_pipelines_common import PipelineTesterMixin, SDFunctionTesterMixin


enable_full_determinism()


@skip_mps
class I2VGenXLPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unittest.TestCase):
    pipeline_class = I2VGenXLPipeline
    params = frozenset(["prompt", "negative_prompt", "image"])
    batch_params = frozenset(["prompt", "negative_prompt", "image", "generator"])
    # No `output_type`.
    required_optional_params = frozenset(["num_inference_steps", "generator", "latents", "return_dict"])

    supports_dduf = False
    test_layerwise_casting = True

    def get_dummy_components(self):
        torch.manual_seed(0)
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )

        torch.manual_seed(0)
        unet = I2VGenXLUNet(
            block_out_channels=(4, 8),
            layers_per_block=1,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
            cross_attention_dim=4,
            attention_head_dim=4,
            num_attention_heads=None,
            norm_num_groups=2,
        )

        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=(8,),
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D"],
            latent_channels=4,
            sample_size=32,
            norm_num_groups=2,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=4,
            intermediate_size=16,
            layer_norm_eps=1e-05,
            num_attention_heads=2,
            num_hidden_layers=2,
            pad_token_id=1,
            vocab_size=1000,
            hidden_act="gelu",
            projection_dim=32,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        torch.manual_seed(0)
        vision_encoder_config = CLIPVisionConfig(
            hidden_size=4,
            projection_dim=4,
            num_hidden_layers=2,
            num_attention_heads=2,
            image_size=32,
            intermediate_size=16,
            patch_size=1,
        )
        image_encoder = CLIPVisionModelWithProjection(vision_encoder_config)

        torch.manual_seed(0)
        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "image_encoder": image_encoder,
            "tokenizer": tokenizer,
            "feature_extractor": feature_extractor,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "image": input_image,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "pt",
            "num_frames": 4,
            "width": 32,
            "height": 32,
        }
        return inputs

    def test_text_to_video_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["output_type"] = "np"
        frames = pipe(**inputs).frames

        image_slice = frames[0][0][-3:, -3:, -1]

        assert frames[0][0].shape == (32, 32, 3)
        expected_slice = np.array([0.5146, 0.6525, 0.6032, 0.5204, 0.5675, 0.4125, 0.3016, 0.5172, 0.4095])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
    @pytest.mark.xfail(
        condition=is_torch_version(">=", "2.7"),
        reason="Test currently fails on PyTorch 2.7.",
        strict=False,
    )
    def test_save_load_local(self):
        super().test_save_load_local(expected_max_difference=0.006)

    def test_sequential_cpu_offload_forward_pass(self):
        super().test_sequential_cpu_offload_forward_pass(expected_max_diff=0.008)

    def test_dict_tuple_outputs_equivalent(self):
        super().test_dict_tuple_outputs_equivalent(expected_max_difference=0.009)

    def test_save_load_optional_components(self):
        super().test_save_load_optional_components(expected_max_difference=0.008)

    @unittest.skip("Deprecated functionality")
    def test_attention_slicing_forward_pass(self):
        pass

    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    def test_xformers_attention_forwardGenerator_pass(self):
        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=1e-2)

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=0.008)

    def test_model_cpu_offload_forward_pass(self):
        super().test_model_cpu_offload_forward_pass(expected_max_diff=0.008)

    def test_num_videos_per_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["output_type"] = "np"
        frames = pipe(**inputs, num_videos_per_prompt=2).frames

        assert frames.shape == (2, 4, 32, 32, 3)
        assert frames[0][0].shape == (32, 32, 3)

        image_slice = frames[0][0][-3:, -3:, -1]
        expected_slice = np.array([0.5146, 0.6525, 0.6032, 0.5204, 0.5675, 0.4125, 0.3016, 0.5172, 0.4095])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
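    # Layout assumed by the shape checks above: frames come back as
    # (num_videos, num_frames, height, width, channels), i.e. (2, 4, 32, 32, 3) for
    # num_videos_per_prompt=2 with the 4-frame, 32x32 dummy inputs; frames[0][0] is the
    # first frame of the first video.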
    @unittest.skip("Test not supported for now.")
    def test_encode_prompt_works_in_isolation(self):
        pass


@slow
@require_torch_accelerator
class I2VGenXLPipelineSlowTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_i2vgen_xl(self):
        pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
        pipe.enable_model_cpu_offload(device=torch_device)
        pipe.set_progress_bar_config(disable=None)
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true"
        )

        generator = torch.Generator("cpu").manual_seed(0)
        num_frames = 3

        output = pipe(
            image=image,
            prompt="my cat",
            num_frames=num_frames,
            generator=generator,
            num_inference_steps=3,
            output_type="np",
        )

        image = output.frames[0]
        assert image.shape == (num_frames, 704, 1280, 3)

        image_slice = image[0, -3:, -3:, -1]
        expected_slice = np.array([0.5482, 0.6244, 0.6274, 0.4584, 0.5935, 0.5937, 0.4579, 0.5767, 0.5892])
        assert numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice.flatten()) < 1e-3
@@ -1,478 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import gc
import unittest

import numpy as np
import torch
from transformers import (
    ClapAudioConfig,
    ClapConfig,
    ClapFeatureExtractor,
    ClapModel,
    ClapTextConfig,
    RobertaTokenizer,
    SpeechT5HifiGan,
    SpeechT5HifiGanConfig,
)

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    LMSDiscreteScheduler,
    MusicLDMPipeline,
    PNDMScheduler,
    UNet2DConditionModel,
)
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    nightly,
    require_torch_accelerator,
    torch_device,
)

from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS
from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class MusicLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = MusicLDMPipeline
    params = TEXT_TO_AUDIO_PARAMS
    batch_params = TEXT_TO_AUDIO_BATCH_PARAMS
    required_optional_params = frozenset(
        [
            "num_inference_steps",
            "num_waveforms_per_prompt",
            "generator",
            "latents",
            "output_type",
            "return_dict",
            "callback",
            "callback_steps",
        ]
    )

    supports_dduf = False

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=(32, 64),
            class_embed_type="simple_projection",
            projection_class_embeddings_input_dim=32,
            class_embeddings_concat=True,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=1,
            out_channels=1,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_branch_config = ClapTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=16,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=2,
            num_hidden_layers=2,
            pad_token_id=1,
            vocab_size=1000,
        )
        audio_branch_config = ClapAudioConfig(
            spec_size=64,
            window_size=4,
            num_mel_bins=64,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            depths=[2, 2],
            num_attention_heads=[2, 2],
            num_hidden_layers=2,
            hidden_size=192,
            patch_size=2,
            patch_stride=2,
            patch_embed_input_channels=4,
        )
        text_encoder_config = ClapConfig.from_text_audio_configs(
            text_config=text_branch_config, audio_config=audio_branch_config, projection_dim=32
        )
        text_encoder = ClapModel(text_encoder_config)
        tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
        feature_extractor = ClapFeatureExtractor.from_pretrained(
            "hf-internal-testing/tiny-random-ClapModel", hop_length=7900
        )

        torch.manual_seed(0)
        vocoder_config = SpeechT5HifiGanConfig(
            model_in_dim=8,
            sampling_rate=16000,
            upsample_initial_channel=16,
            upsample_rates=[2, 2],
            upsample_kernel_sizes=[4, 4],
            resblock_kernel_sizes=[3, 7],
            resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]],
            normalize_before=False,
        )

        vocoder = SpeechT5HifiGan(vocoder_config)

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "feature_extractor": feature_extractor,
            "vocoder": vocoder,
        }
        return components
    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A hammer hitting a wooden surface",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
        }
        return inputs

    def test_musicldm_ddim(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        musicldm_pipe = MusicLDMPipeline(**components)
        musicldm_pipe = musicldm_pipe.to(torch_device)
        musicldm_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = musicldm_pipe(**inputs)
        audio = output.audios[0]

        assert audio.ndim == 1
        assert len(audio) == 256

        audio_slice = audio[:10]
        expected_slice = np.array(
            [-0.0027, -0.0036, -0.0037, -0.0020, -0.0035, -0.0019, -0.0037, -0.0020, -0.0038, -0.0019]
        )

        assert np.abs(audio_slice - expected_slice).max() < 1e-4
    def test_musicldm_prompt_embeds(self):
        components = self.get_dummy_components()
        musicldm_pipe = MusicLDMPipeline(**components)
        musicldm_pipe = musicldm_pipe.to(torch_device)
        musicldm_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = musicldm_pipe(**inputs)
        audio_1 = output.audios[0]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        text_inputs = musicldm_pipe.tokenizer(
            prompt,
            padding="max_length",
            max_length=musicldm_pipe.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_inputs = text_inputs["input_ids"].to(torch_device)

        prompt_embeds = musicldm_pipe.text_encoder.get_text_features(text_inputs)

        inputs["prompt_embeds"] = prompt_embeds

        # forward
        output = musicldm_pipe(**inputs)
        audio_2 = output.audios[0]

        assert np.abs(audio_1 - audio_2).max() < 1e-2

    def test_musicldm_negative_prompt_embeds(self):
        components = self.get_dummy_components()
        musicldm_pipe = MusicLDMPipeline(**components)
        musicldm_pipe = musicldm_pipe.to(torch_device)
        musicldm_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        negative_prompt = 3 * ["this is a negative prompt"]
        inputs["negative_prompt"] = negative_prompt
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = musicldm_pipe(**inputs)
        audio_1 = output.audios[0]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        embeds = []
        for p in [prompt, negative_prompt]:
            text_inputs = musicldm_pipe.tokenizer(
                p,
                padding="max_length",
                max_length=musicldm_pipe.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_inputs = text_inputs["input_ids"].to(torch_device)

            text_embeds = musicldm_pipe.text_encoder.get_text_features(
                text_inputs,
            )
            embeds.append(text_embeds)

        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds

        # forward
        output = musicldm_pipe(**inputs)
        audio_2 = output.audios[0]

        assert np.abs(audio_1 - audio_2).max() < 1e-2
    def test_musicldm_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
        musicldm_pipe = MusicLDMPipeline(**components)
        musicldm_pipe = musicldm_pipe.to(device)
        musicldm_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "egg cracking"
        output = musicldm_pipe(**inputs, negative_prompt=negative_prompt)
        audio = output.audios[0]

        assert audio.ndim == 1
        assert len(audio) == 256

        audio_slice = audio[:10]
        expected_slice = np.array(
            [-0.0027, -0.0036, -0.0037, -0.0019, -0.0035, -0.0018, -0.0037, -0.0021, -0.0038, -0.0018]
        )

        assert np.abs(audio_slice - expected_slice).max() < 1e-4

    def test_musicldm_num_waveforms_per_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
        musicldm_pipe = MusicLDMPipeline(**components)
        musicldm_pipe = musicldm_pipe.to(device)
        musicldm_pipe.set_progress_bar_config(disable=None)

        prompt = "A hammer hitting a wooden surface"

        # test num_waveforms_per_prompt=1 (default)
        audios = musicldm_pipe(prompt, num_inference_steps=2).audios

        assert audios.shape == (1, 256)

        # test num_waveforms_per_prompt=1 (default) for batch of prompts
        batch_size = 2
        audios = musicldm_pipe([prompt] * batch_size, num_inference_steps=2).audios

        assert audios.shape == (batch_size, 256)

        # test num_waveforms_per_prompt for single prompt
        num_waveforms_per_prompt = 2
        audios = musicldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios

        assert audios.shape == (num_waveforms_per_prompt, 256)

        # test num_waveforms_per_prompt for batch of prompts
        batch_size = 2
        audios = musicldm_pipe(
            [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
        ).audios

        assert audios.shape == (batch_size * num_waveforms_per_prompt, 256)

    def test_musicldm_audio_length_in_s(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        musicldm_pipe = MusicLDMPipeline(**components)
        musicldm_pipe = musicldm_pipe.to(torch_device)
        musicldm_pipe.set_progress_bar_config(disable=None)
        vocoder_sampling_rate = musicldm_pipe.vocoder.config.sampling_rate

        inputs = self.get_dummy_inputs(device)
        output = musicldm_pipe(audio_length_in_s=0.016, **inputs)
        audio = output.audios[0]

        assert audio.ndim == 1
        assert len(audio) / vocoder_sampling_rate == 0.016

        output = musicldm_pipe(audio_length_in_s=0.032, **inputs)
        audio = output.audios[0]

        assert audio.ndim == 1
        assert len(audio) / vocoder_sampling_rate == 0.032
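    # The arithmetic behind the two duration checks above: the pipeline emits
    # audio_length_in_s * sampling_rate samples, so with the dummy 16 kHz vocoder
    # 0.016 s -> 256 samples and 0.032 s -> 512 samples.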
    def test_musicldm_vocoder_model_in_dim(self):
        components = self.get_dummy_components()
        musicldm_pipe = MusicLDMPipeline(**components)
        musicldm_pipe = musicldm_pipe.to(torch_device)
        musicldm_pipe.set_progress_bar_config(disable=None)

        prompt = ["hey"]

        output = musicldm_pipe(prompt, num_inference_steps=1)
        audio_shape = output.audios.shape
        assert audio_shape == (1, 256)

        config = musicldm_pipe.vocoder.config
        config.model_in_dim *= 2
        musicldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device)
        output = musicldm_pipe(prompt, num_inference_steps=1)
        audio_shape = output.audios.shape
        # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram
        assert audio_shape == (1, 256)

    def test_attention_slicing_forward_pass(self):
        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)

    def test_inference_batch_single_identical(self):
        self._test_inference_batch_single_identical()

    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    def test_xformers_attention_forwardGenerator_pass(self):
        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)

    def test_to_dtype(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.set_progress_bar_config(disable=None)

        # The method component.dtype returns the dtype of the first parameter registered in the model, not the
        # dtype of the entire model. In the case of CLAP, the first parameter is a float64 constant (logit scale)
        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}

        # Without the logit scale parameters, everything is float32
        model_dtypes.pop("text_encoder")
        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))

        # the CLAP sub-models are float32
        model_dtypes["clap_text_branch"] = components["text_encoder"].text_model.dtype
        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes.values()))

        # Once we send to fp16, all params are in half-precision, including the logit scale
        pipe.to(dtype=torch.float16)
        model_dtypes = {key: component.dtype for key, component in components.items() if hasattr(component, "dtype")}
        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes.values()))
|
||||
|
||||
|
||||
@nightly
|
||||
@require_torch_accelerator
|
||||
class MusicLDMPipelineNightlyTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
|
||||
generator = torch.Generator(device=generator_device).manual_seed(seed)
|
||||
latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16))
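        # NumPy-seeded latents are device-independent, which keeps the nightly reference slices stable across accelerators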
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "A hammer hitting a wooden surface",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 2.5,
        }
        return inputs

    def test_musicldm(self):
        musicldm_pipe = MusicLDMPipeline.from_pretrained("cvssp/musicldm")
        musicldm_pipe = musicldm_pipe.to(torch_device)
        musicldm_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 25
        audio = musicldm_pipe(**inputs).audios[0]

        assert audio.ndim == 1
        assert len(audio) == 81952

        # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
        audio_slice = audio[8680:8690]
        expected_slice = np.array(
            [-0.1042, -0.1068, -0.1235, -0.1387, -0.1428, -0.136, -0.1213, -0.1097, -0.0967, -0.0945]
        )
        max_diff = np.abs(expected_slice - audio_slice).max()
        assert max_diff < 1e-3

    def test_musicldm_lms(self):
        musicldm_pipe = MusicLDMPipeline.from_pretrained("cvssp/musicldm")
        musicldm_pipe.scheduler = LMSDiscreteScheduler.from_config(musicldm_pipe.scheduler.config)
        musicldm_pipe = musicldm_pipe.to(torch_device)
        musicldm_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        audio = musicldm_pipe(**inputs).audios[0]

        assert audio.ndim == 1
        assert len(audio) == 81952

        # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
        audio_slice = audio[58020:58030]
        expected_slice = np.array([0.3592, 0.3477, 0.4084, 0.4665, 0.5048, 0.5891, 0.6461, 0.5579, 0.4595, 0.4403])
        max_diff = np.abs(expected_slice - audio_slice).max()
        assert max_diff < 1e-3
@@ -1,229 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import unittest

import numpy as np
import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionConfig

from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel
from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    floats_tensor,
    load_image,
    nightly,
    require_torch_accelerator,
    torch_device,
)

from ..pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = PaintByExamplePipeline
    params = IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS
    batch_params = IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
    image_params = frozenset([])  # TODO: update image_params once VaeImageProcessor.preprocess is refactored

    supports_dduf = False

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=9,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = PNDMScheduler(skip_prk_steps=True)
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        config = CLIPVisionConfig(
            hidden_size=32,
            projection_dim=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            image_size=32,
            patch_size=4,
        )
        image_encoder = PaintByExampleImageEncoder(config, proj_size=32)
        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "image_encoder": image_encoder,
            "safety_checker": None,
            "feature_extractor": feature_extractor,
        }
        return components

    def convert_to_pt(self, image):
        image = np.array(image.convert("RGB"))
        image = image[None].transpose(0, 3, 1, 2)
        image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
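        # maps uint8 pixels in [0, 255] to the [-1, 1] range the VAE expects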
        return image

    def get_dummy_inputs(self, device="cpu", seed=0):
        # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched
        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
        image = image.cpu().permute(0, 2, 3, 1)[0]
        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64))
        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64))
        example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))

        if str(device).startswith("mps"):
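            # a device-bound torch.Generator is not reliably supported on mps, so seed the global RNG instead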
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "example_image": example_image,
            "image": init_image,
            "mask_image": mask_image,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "np",
        }
        return inputs

    def test_paint_by_example_inpaint(self):
        components = self.get_dummy_components()

        # make sure here that pndm scheduler skips prk
        pipe = PaintByExamplePipeline(**components)
        pipe = pipe.to("cpu")
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs()
        output = pipe(**inputs)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.4686, 0.5687, 0.4007, 0.5218, 0.5741, 0.4482, 0.4940, 0.4629, 0.4503])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_paint_by_example_image_tensor(self):
        device = "cpu"
        inputs = self.get_dummy_inputs()
        inputs.pop("mask_image")
        image = self.convert_to_pt(inputs.pop("image"))
        mask_image = image.clamp(0, 1) / 2
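        # derive a dummy mask in [0, 0.5] from the image itself; this test only checks tensor vs. PIL input equivalence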

        # make sure here that pndm scheduler skips prk
        pipe = PaintByExamplePipeline(**self.get_dummy_components())
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        output = pipe(image=image, mask_image=mask_image[:, 0], **inputs)
        out_1 = output.images

        image = image.cpu().permute(0, 2, 3, 1)[0]
        mask_image = mask_image.cpu().permute(0, 2, 3, 1)[0]

        image = Image.fromarray(np.uint8(image)).convert("RGB")
        mask_image = Image.fromarray(np.uint8(mask_image)).convert("RGB")

        output = pipe(**self.get_dummy_inputs())
        out_2 = output.images

        assert out_1.shape == (1, 64, 64, 3)
        assert np.abs(out_1.flatten() - out_2.flatten()).max() < 5e-2

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=3e-3)


@nightly
@require_torch_accelerator
class PaintByExamplePipelineIntegrationTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_paint_by_example(self):
        # make sure here that pndm scheduler skips prk
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/paint_by_example/dog_in_bucket.png"
        )
        mask_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/paint_by_example/mask.png"
        )
        example_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/paint_by_example/panda.jpg"
        )

        pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example")
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(321)
        output = pipe(
            image=init_image,
            mask_image=mask_image,
            example_image=example_image,
            generator=generator,
            guidance_scale=5.0,
            num_inference_steps=50,
            output_type="np",
        )

        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.5290, 0.5374])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -1,448 +0,0 @@
import random
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

import diffusers
from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    LCMScheduler,
    MotionAdapter,
    PIAPipeline,
    StableDiffusionPipeline,
    UNet2DConditionModel,
    UNetMotionModel,
)
from diffusers.utils import is_xformers_available, logging
from diffusers.utils.testing_utils import floats_tensor, require_accelerator, torch_device

from ..test_pipelines_common import IPAdapterTesterMixin, PipelineFromPipeTesterMixin, PipelineTesterMixin


def to_np(tensor):
    if isinstance(tensor, torch.Tensor):
        tensor = tensor.detach().cpu().numpy()

    return tensor


class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase):
    pipeline_class = PIAPipeline
    params = frozenset(
        [
            "prompt",
            "height",
            "width",
            "guidance_scale",
            "negative_prompt",
            "prompt_embeds",
            "negative_prompt_embeds",
            "cross_attention_kwargs",
        ]
    )
    batch_params = frozenset(["prompt", "image", "generator"])
    required_optional_params = frozenset(
        [
            "num_inference_steps",
            "generator",
            "latents",
            "return_dict",
            "callback_on_step_end",
            "callback_on_step_end_tensor_inputs",
        ]
    )
    test_layerwise_casting = True
    test_group_offloading = True

    def get_dummy_components(self):
        cross_attention_dim = 8
        block_out_channels = (8, 8)

        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=block_out_channels,
            layers_per_block=2,
            sample_size=8,
            in_channels=4,
            out_channels=4,
            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=cross_attention_dim,
            norm_num_groups=2,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="linear",
            clip_sample=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=block_out_channels,
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            norm_num_groups=2,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=cross_attention_dim,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        torch.manual_seed(0)
        motion_adapter = MotionAdapter(
            block_out_channels=block_out_channels,
            motion_layers_per_block=2,
            motion_norm_num_groups=2,
            motion_num_attention_heads=4,
            conv_in_channels=9,
        )

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "motion_adapter": motion_adapter,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "feature_extractor": None,
            "image_encoder": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        image = floats_tensor((1, 3, 8, 8), rng=random.Random(seed)).to(device)
        inputs = {
            "image": image,
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 7.5,
            "output_type": "pt",
        }
        return inputs

    def test_from_pipe_consistent_config(self):
        assert self.original_pipeline_class == StableDiffusionPipeline
        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
        original_kwargs = {"requires_safety_checker": False}

        # create original_pipeline_class(sd)
        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)

        # original_pipeline_class(sd) -> pipeline_class
        pipe_components = self.get_dummy_components()
        pipe_additional_components = {}
        for name, component in pipe_components.items():
            if name not in pipe_original.components:
                pipe_additional_components[name] = component

        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)

        # pipeline_class -> original_pipeline_class(sd)
        original_pipe_additional_components = {}
        for name, component in pipe_original.components.items():
            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
                original_pipe_additional_components[name] = component

        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)

        # compare the config
        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
        assert original_config_2 == original_config

    def test_motion_unet_loading(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)

        assert isinstance(pipe.unet, UNetMotionModel)

    def test_ip_adapter(self):
        expected_pipe_slice = None

        if torch_device == "cpu":
            expected_pipe_slice = np.array(
                [
                    0.5475, 0.5769, 0.4873, 0.5064, 0.4445, 0.5876, 0.5453, 0.4102, 0.5247,
                    0.5370, 0.3406, 0.4322, 0.3991, 0.3756, 0.5438, 0.4780, 0.5087, 0.5248,
                    0.6243, 0.5506, 0.3491, 0.5440, 0.6111, 0.5122, 0.5326, 0.5180, 0.5538,
                ]
            )
        return super().test_ip_adapter(expected_pipe_slice=expected_pipe_slice)

    def test_dict_tuple_outputs_equivalent(self):
        expected_slice = None
        if torch_device == "cpu":
            expected_slice = np.array([0.5476, 0.4092, 0.5289, 0.4755, 0.5092, 0.5186, 0.5403, 0.5287, 0.5467])
        return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)

    @unittest.skip("Attention slicing is not enabled in this pipeline")
    def test_attention_slicing_forward_pass(self):
        pass

    def test_inference_batch_single_identical(
        self,
        batch_size=2,
        expected_max_diff=1e-4,
        additional_params_copy_to_batched_inputs=["num_inference_steps"],
    ):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        for component in pipe.components.values():
            if hasattr(component, "set_default_attn_processor"):
                component.set_default_attn_processor()

        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        inputs = self.get_dummy_inputs(torch_device)
        # Reset the generator in case it has been used in self.get_dummy_inputs
        inputs["generator"] = self.get_generator(0)

        logger = logging.get_logger(pipe.__module__)
        logger.setLevel(level=diffusers.logging.FATAL)

        # batchify inputs
        batched_inputs = {}
        batched_inputs.update(inputs)

        for name in self.batch_params:
            if name not in inputs:
                continue

            value = inputs[name]
            if name == "prompt":
                len_prompt = len(value)
                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
                batched_inputs[name][-1] = 100 * "very long"
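                # the last prompt is made deliberately over-long to exercise tokenizer truncation in the batched path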

            else:
                batched_inputs[name] = batch_size * [value]

        if "generator" in inputs:
            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]

        if "batch_size" in inputs:
            batched_inputs["batch_size"] = batch_size

        for arg in additional_params_copy_to_batched_inputs:
            batched_inputs[arg] = inputs[arg]

        output = pipe(**inputs)
        output_batch = pipe(**batched_inputs)

        assert output_batch[0].shape[0] == batch_size

        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
        assert max_diff < expected_max_diff

    @require_accelerator
    def test_to_device(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.set_progress_bar_config(disable=None)

        pipe.to("cpu")
        # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
        model_devices = [
            component.device.type for component in pipe.components.values() if hasattr(component, "device")
        ]
        self.assertTrue(all(device == "cpu" for device in model_devices))

        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
        self.assertTrue(np.isnan(output_cpu).sum() == 0)

        pipe.to(torch_device)
        model_devices = [
            component.device.type for component in pipe.components.values() if hasattr(component, "device")
        ]
        self.assertTrue(all(device == torch_device for device in model_devices))

        output_device = pipe(**self.get_dummy_inputs(torch_device))[0]
        self.assertTrue(np.isnan(to_np(output_device)).sum() == 0)

    def test_to_dtype(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.set_progress_bar_config(disable=None)

        # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))

        pipe.to(dtype=torch.float16)
        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))

    def test_prompt_embeds(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.set_progress_bar_config(disable=None)
        pipe.to(torch_device)

        inputs = self.get_dummy_inputs(torch_device)
        inputs.pop("prompt")
        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
        pipe(**inputs)

    def test_free_init(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.set_progress_bar_config(disable=None)
        pipe.to(torch_device)

        inputs_normal = self.get_dummy_inputs(torch_device)
        frames_normal = pipe(**inputs_normal).frames[0]

        pipe.enable_free_init(
            num_iters=2,
            use_fast_sampling=True,
            method="butterworth",
            order=4,
            spatial_stop_frequency=0.25,
            temporal_stop_frequency=0.25,
        )
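        # roughly: FreeInit repeatedly re-initializes the noise, low-pass filtering it (here with a butterworth filter)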
        inputs_enable_free_init = self.get_dummy_inputs(torch_device)
        frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]

        pipe.disable_free_init()
        inputs_disable_free_init = self.get_dummy_inputs(torch_device)
        frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0]

        sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
        max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max()
        self.assertGreater(
            sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results"
        )
        self.assertLess(
            max_diff_disabled,
            1e-4,
            "Disabling of FreeInit should lead to results similar to the default pipeline results",
        )

    def test_free_init_with_schedulers(self):
        components = self.get_dummy_components()
        pipe: PIAPipeline = self.pipeline_class(**components)
        pipe.set_progress_bar_config(disable=None)
        pipe.to(torch_device)

        inputs_normal = self.get_dummy_inputs(torch_device)
        frames_normal = pipe(**inputs_normal).frames[0]

        schedulers_to_test = [
            DPMSolverMultistepScheduler.from_config(
                components["scheduler"].config,
                timestep_spacing="linspace",
                beta_schedule="linear",
                algorithm_type="dpmsolver++",
                steps_offset=1,
                clip_sample=False,
            ),
            LCMScheduler.from_config(
                components["scheduler"].config,
                timestep_spacing="linspace",
                beta_schedule="linear",
                steps_offset=1,
                clip_sample=False,
            ),
        ]
        components.pop("scheduler")

        for scheduler in schedulers_to_test:
            components["scheduler"] = scheduler
            pipe: PIAPipeline = self.pipeline_class(**components)
            pipe.set_progress_bar_config(disable=None)
            pipe.to(torch_device)

            pipe.enable_free_init(num_iters=2, use_fast_sampling=False)

            inputs = self.get_dummy_inputs(torch_device)
            frames_enable_free_init = pipe(**inputs).frames[0]
            sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()

            self.assertGreater(
                sum_enabled,
                1e1,
                "Enabling of FreeInit should lead to results different from the default pipeline results",
            )

    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    def test_xformers_attention_forwardGenerator_pass(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        for component in pipe.components.values():
            if hasattr(component, "set_default_attn_processor"):
                component.set_default_attn_processor()
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        output_without_offload = pipe(**inputs).frames[0]
        output_without_offload = (
            output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
        )

        pipe.enable_xformers_memory_efficient_attention()
        inputs = self.get_dummy_inputs(torch_device)
        output_with_offload = pipe(**inputs).frames[0]
        output_with_offload = (
            output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_with_offload
        )

        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
        self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")

    def test_encode_prompt_works_in_isolation(self):
        extra_required_param_value_dict = {
            "device": torch.device(torch_device).type,
            "num_images_per_prompt": 1,
            "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
        }
        return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
@@ -1,617 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import tempfile
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
from diffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipeline as StableDiffusionPipeline
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    floats_tensor,
    nightly,
    require_torch_accelerator,
    torch_device,
)


enable_full_determinism()


class SafeDiffusionPipelineFastTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    @property
    def dummy_image(self):
        batch_size = 1
        num_channels = 3
        sizes = (32, 32)

        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
        return image

    @property
    def dummy_cond_unet(self):
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        return model

    @property
    def dummy_vae(self):
        torch.manual_seed(0)
        model = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        return model

    @property
    def dummy_text_encoder(self):
        torch.manual_seed(0)
        config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        return CLIPTextModel(config)

    @property
    def dummy_extractor(self):
        def extract(*args, **kwargs):
            class Out:
                def __init__(self):
                    self.pixel_values = torch.ones([0])

                def to(self, device):
                    self.pixel_values.to(device)
                    return self

            return Out()

        return extract

    def test_semantic_diffusion_ddim(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )

        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        sd_pipe = StableDiffusionPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5753, 0.6114, 0.5001, 0.5034, 0.5470, 0.4729, 0.4971, 0.4867, 0.4867])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_semantic_diffusion_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5122, 0.5712, 0.4825, 0.5053, 0.5646, 0.4769, 0.5179, 0.4894, 0.4994])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_semantic_diffusion_no_safety_checker(self):
        pipe = StableDiffusionPipeline.from_pretrained(
            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
        )
        assert isinstance(pipe, StableDiffusionPipeline)
        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
        assert pipe.safety_checker is None

        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

        # check that there's no error when saving a pipeline with one of the models being None
        with tempfile.TemporaryDirectory() as tmpdirname:
            pipe.save_pretrained(tmpdirname)
            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)

        # sanity check that the pipeline still works
        assert pipe.safety_checker is None
        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

    @require_torch_accelerator
    def test_semantic_diffusion_fp16(self):
        """Test that stable diffusion works with fp16"""
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # put models in fp16
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images

        assert image.shape == (1, 64, 64, 3)


@nightly
@require_torch_accelerator
class SemanticDiffusionPipelineIntegrationTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_positive_guidance(self):
        pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        prompt = "a photo of a cat"
        edit = {
            "editing_prompt": ["sunglasses"],
            "reverse_editing_direction": [False],
            "edit_warmup_steps": 10,
            "edit_guidance_scale": 6,
            "edit_threshold": 0.95,
            "edit_momentum_scale": 0.5,
            "edit_mom_beta": 0.6,
        }
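        # SEGA edit config: steer the generation toward "sunglasses" after a 10-step warmup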

        seed = 3
        guidance_scale = 7

        # no sega enabled
        generator = torch.Generator(torch_device)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.34673113, 0.38492733, 0.37597352, 0.34086335, 0.35650748, 0.35579205,
            0.3384763, 0.34340236, 0.3573271,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

        # with sega enabled
        # generator = torch.manual_seed(seed)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            **edit,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.41887826, 0.37728766, 0.30138272, 0.41416335, 0.41664985, 0.36283392,
            0.36191246, 0.43364465, 0.43001732,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_negative_guidance(self):
        pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        prompt = "an image of a crowded boulevard, realistic, 4k"
        edit = {
            "editing_prompt": "crowd, crowded, people",
            "reverse_editing_direction": True,
            "edit_warmup_steps": 10,
            "edit_guidance_scale": 8.3,
            "edit_threshold": 0.9,
            "edit_momentum_scale": 0.5,
            "edit_mom_beta": 0.6,
        }
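        # reverse_editing_direction=True steers the generation away from the crowd-related concepts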

        seed = 9
        guidance_scale = 7

        # no sega enabled
        generator = torch.Generator(torch_device)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.43497998, 0.91814065, 0.7540739, 0.55580205, 0.8467265, 0.5389691,
            0.62574506, 0.58897763, 0.50926757,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

        # with sega enabled
        # generator = torch.manual_seed(seed)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            **edit,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.3089719, 0.30500144, 0.29016042, 0.30630964, 0.325687, 0.29419225,
            0.2908091, 0.28723598, 0.27696294,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_multi_cond_guidance(self):
        pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        prompt = "a castle next to a river"
        edit = {
            "editing_prompt": ["boat on a river, boat", "monet, impression, sunrise"],
            "reverse_editing_direction": False,
            "edit_warmup_steps": [15, 18],
            "edit_guidance_scale": 6,
            "edit_threshold": [0.9, 0.8],
            "edit_momentum_scale": 0.5,
            "edit_mom_beta": 0.6,
        }
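        # two simultaneous edit prompts, each with its own warmup schedule and threshold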

        seed = 48
        guidance_scale = 7

        # no sega enabled
        generator = torch.Generator(torch_device)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.75163555, 0.76037145, 0.61785, 0.9189673, 0.8627701, 0.85189694,
            0.8512813, 0.87012076, 0.8312857,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

        # with sega enabled
        # generator = torch.manual_seed(seed)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            **edit,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.73553365, 0.7537271, 0.74341905, 0.66480356, 0.6472925, 0.63039416,
            0.64812905, 0.6749717, 0.6517102,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_guidance_fp16(self):
        pipe = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        prompt = "a photo of a cat"
        edit = {
            "editing_prompt": ["sunglasses"],
            "reverse_editing_direction": [False],
            "edit_warmup_steps": 10,
            "edit_guidance_scale": 6,
            "edit_threshold": 0.95,
            "edit_momentum_scale": 0.5,
            "edit_mom_beta": 0.6,
        }

        seed = 3
        guidance_scale = 7

        # no sega enabled
        generator = torch.Generator(torch_device)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.34887695, 0.3876953, 0.375, 0.34423828, 0.3581543, 0.35717773,
            0.3383789, 0.34570312, 0.359375,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

        # with sega enabled
        # generator = torch.manual_seed(seed)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            **edit,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.42285156, 0.36914062, 0.29077148, 0.42041016, 0.41918945, 0.35498047,
            0.3618164, 0.4423828, 0.43115234,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -1,267 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    StableDiffusionAttendAndExcitePipeline,
    UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    load_numpy,
    nightly,
    numpy_cosine_similarity_distance,
    require_torch_accelerator,
    skip_mps,
    torch_device,
)

from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import (
    PipelineFromPipeTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
)


torch.backends.cuda.matmul.allow_tf32 = False


@skip_mps
class StableDiffusionAttendAndExcitePipelineFastTests(
    PipelineLatentTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineTesterMixin,
    PipelineFromPipeTesterMixin,
    unittest.TestCase,
):
    pipeline_class = StableDiffusionAttendAndExcitePipeline
    test_attention_slicing = False
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"token_indices"})
    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS

    # Attend-and-excite requires being able to run a backward pass at inference time.
    # There's no deterministic backward operator for pad, so deterministic algorithms
    # are disabled for these tests.

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        torch.use_deterministic_algorithms(False)

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        torch.use_deterministic_algorithms(True)

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=1,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
            # SD2-specific config below
            attention_head_dim=(2, 4),
            use_linear_projection=True,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            sample_size=128,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
            # SD2-specific config below
            hidden_act="gelu",
            projection_dim=512,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }

        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "a cat and a frog",
            "token_indices": [2, 5],
            "generator": generator,
            "num_inference_steps": 1,
            "guidance_scale": 6.0,
            "output_type": "np",
            "max_iter_to_alter": 2,
            "thresholds": {0: 0.7},
        }
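        # max_iter_to_alter=2 with a loose threshold keeps the iterative latent updates cheap for this fast test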
        return inputs

    def test_dict_tuple_outputs_equivalent(self):
        expected_slice = None
        if torch_device == "cpu":
            expected_slice = np.array([0.6391, 0.6290, 0.4860, 0.5134, 0.5550, 0.4577, 0.5033, 0.5023, 0.4538])
        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice, expected_max_difference=3e-3)

    def test_inference(self):
        device = "cpu"

        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        self.assertEqual(image.shape, (1, 64, 64, 3))
        expected_slice = np.array(
            [0.63905364, 0.62897307, 0.48599017, 0.5133624, 0.5550048, 0.45769516, 0.50326973, 0.5023139, 0.45384496]
        )
        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
        self.assertLessEqual(max_diff, 1e-3)

    def test_sequential_cpu_offload_forward_pass(self):
        super().test_sequential_cpu_offload_forward_pass(expected_max_diff=5e-4)

    def test_inference_batch_consistent(self):
        # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches
        self._test_inference_batch_consistent(batch_sizes=[1, 2])

    def test_inference_batch_single_identical(self):
        self._test_inference_batch_single_identical(batch_size=2, expected_max_diff=7e-4)

    def test_pt_np_pil_outputs_equivalent(self):
        super().test_pt_np_pil_outputs_equivalent(expected_max_diff=5e-4)

    def test_save_load_local(self):
        super().test_save_load_local(expected_max_difference=5e-4)

    def test_save_load_optional_components(self):
        super().test_save_load_optional_components(expected_max_difference=4e-4)

    def test_karras_schedulers_shape(self):
        super().test_karras_schedulers_shape(num_inference_steps_for_strength_for_iterations=3)

    def test_from_pipe_consistent_forward_pass_cpu_offload(self):
        super().test_from_pipe_consistent_forward_pass_cpu_offload(expected_max_diff=5e-3)

    def test_encode_prompt_works_in_isolation(self):
        extra_required_param_value_dict = {
            "device": torch.device(torch_device).type,
            "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
        }
        return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)


@require_torch_accelerator
@nightly
class StableDiffusionAttendAndExcitePipelineIntegrationTests(unittest.TestCase):
    # Attend-and-excite requires being able to run a backward pass at inference time.
    # There's no deterministic backward operator for pad, so deterministic algorithms
    # are disabled for these tests.

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        torch.use_deterministic_algorithms(False)

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        torch.use_deterministic_algorithms(True)

    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_attend_and_excite_fp16(self):
        generator = torch.manual_seed(51)

        pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
        )
        pipe.to(torch_device)

        prompt = "a painting of an elephant with glasses"
        token_indices = [5, 7]
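        # positions of "elephant" (5) and "glasses" (7) in the tokenized prompt (index 0 is the BOS token)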

        image = pipe(
            prompt=prompt,
            token_indices=token_indices,
            guidance_scale=7.5,
            generator=generator,
            num_inference_steps=5,
            max_iter_to_alter=5,
            output_type="np",
        ).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/attend-and-excite/elephant_glasses.npy"
        )
        max_diff = numpy_cosine_similarity_distance(image.flatten(), expected_image.flatten())
        assert max_diff < 5e-1
@@ -1,452 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import tempfile
import unittest

import numpy as np
import torch
from PIL import Image
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMInverseScheduler,
    DDIMScheduler,
    DPMSolverMultistepInverseScheduler,
    DPMSolverMultistepScheduler,
    StableDiffusionDiffEditPipeline,
    UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    floats_tensor,
    load_image,
    nightly,
    numpy_cosine_similarity_distance,
    require_torch_accelerator,
    torch_device,
)

from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
from ..test_pipelines_common import PipelineFromPipeTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin


enable_full_determinism()


class StableDiffusionDiffEditPipelineFastTests(
    PipelineLatentTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
):
    pipeline_class = StableDiffusionDiffEditPipeline
    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - {"height", "width", "image"} | {"image_latents"}
    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - {"image"} | {"image_latents"}
    image_params = frozenset(
        []
    )  # TODO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
    image_latents_params = frozenset([])

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
            # SD2-specific config below
            attention_head_dim=(2, 4),
            use_linear_projection=True,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        inverse_scheduler = DDIMInverseScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_zero=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            sample_size=128,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
            # SD2-specific config below
            hidden_act="gelu",
            projection_dim=512,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "inverse_scheduler": inverse_scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }

        return components

    def get_dummy_inputs(self, device, seed=0):
        mask = floats_tensor((1, 16, 16), rng=random.Random(seed)).to(device)
        latents = floats_tensor((1, 2, 4, 16, 16), rng=random.Random(seed)).to(device)
        if str(device).startswith("mps"):
|
||||
generator = torch.manual_seed(seed)
|
||||
else:
|
||||
generator = torch.Generator(device=device).manual_seed(seed)
|
||||
inputs = {
|
||||
"prompt": "a dog and a newt",
|
||||
"mask_image": mask,
|
||||
"image_latents": latents,
|
||||
"generator": generator,
|
||||
"num_inference_steps": 2,
|
||||
"inpaint_strength": 1.0,
|
||||
"guidance_scale": 6.0,
|
||||
"output_type": "np",
|
||||
}
|
||||
|
||||
return inputs
|
||||
|
||||
def get_dummy_mask_inputs(self, device, seed=0):
|
||||
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
|
||||
image = image.cpu().permute(0, 2, 3, 1)[0]
|
||||
image = Image.fromarray(np.uint8(image)).convert("RGB")
|
||||
if str(device).startswith("mps"):
|
||||
generator = torch.manual_seed(seed)
|
||||
else:
|
||||
generator = torch.Generator(device=device).manual_seed(seed)
|
||||
inputs = {
|
||||
"image": image,
|
||||
"source_prompt": "a cat and a frog",
|
||||
"target_prompt": "a dog and a newt",
|
||||
"generator": generator,
|
||||
"num_inference_steps": 2,
|
||||
"num_maps_per_mask": 2,
|
||||
"mask_encode_strength": 1.0,
|
||||
"guidance_scale": 6.0,
|
||||
"output_type": "np",
|
||||
}
|
||||
|
||||
return inputs
|
||||
|
||||
def get_dummy_inversion_inputs(self, device, seed=0):
|
||||
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
|
||||
image = image.cpu().permute(0, 2, 3, 1)[0]
|
||||
image = Image.fromarray(np.uint8(image)).convert("RGB")
|
||||
if str(device).startswith("mps"):
|
||||
generator = torch.manual_seed(seed)
|
||||
else:
|
||||
generator = torch.Generator(device=device).manual_seed(seed)
|
||||
inputs = {
|
||||
"image": image,
|
||||
"prompt": "a cat and a frog",
|
||||
"generator": generator,
|
||||
"num_inference_steps": 2,
|
||||
"inpaint_strength": 1.0,
|
||||
"guidance_scale": 6.0,
|
||||
"decode_latents": True,
|
||||
"output_type": "np",
|
||||
}
|
||||
return inputs
|
||||
|
||||
def test_save_load_optional_components(self):
|
||||
if not hasattr(self.pipeline_class, "_optional_components"):
|
||||
return
|
||||
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# set all optional components to None and update pipeline config accordingly
|
||||
for optional_component in pipe._optional_components:
|
||||
setattr(pipe, optional_component, None)
|
||||
pipe.register_modules(**dict.fromkeys(pipe._optional_components))
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
output = pipe(**inputs)[0]
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
pipe.save_pretrained(tmpdir)
|
||||
pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
|
||||
pipe_loaded.to(torch_device)
|
||||
pipe_loaded.set_progress_bar_config(disable=None)
|
||||
|
||||
for optional_component in pipe._optional_components:
|
||||
self.assertTrue(
|
||||
getattr(pipe_loaded, optional_component) is None,
|
||||
f"`{optional_component}` did not stay set to None after loading.",
|
||||
)
|
||||
|
||||
inputs = self.get_dummy_inputs(torch_device)
|
||||
output_loaded = pipe_loaded(**inputs)[0]
|
||||
|
||||
max_diff = np.abs(output - output_loaded).max()
|
||||
self.assertLess(max_diff, 1e-4)
|
||||
|
||||
def test_mask(self):
|
||||
device = "cpu"
|
||||
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.to(device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_mask_inputs(device)
|
||||
mask = pipe.generate_mask(**inputs)
|
||||
mask_slice = mask[0, -3:, -3:]
|
||||
|
||||
self.assertEqual(mask.shape, (1, 16, 16))
|
||||
expected_slice = np.array([0] * 9)
|
||||
max_diff = np.abs(mask_slice.flatten() - expected_slice).max()
|
||||
self.assertLessEqual(max_diff, 1e-3)
|
||||
self.assertEqual(mask[0, -3, -4], 0)
|
||||
|
||||
def test_inversion(self):
|
||||
device = "cpu"
|
||||
|
||||
components = self.get_dummy_components()
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.to(device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inversion_inputs(device)
|
||||
image = pipe.invert(**inputs).images
|
||||
image_slice = image[0, -1, -3:, -3:]
|
||||
|
||||
self.assertEqual(image.shape, (2, 32, 32, 3))
|
||||
expected_slice = np.array(
|
||||
[0.5160, 0.5115, 0.5060, 0.5456, 0.4704, 0.5060, 0.5019, 0.4405, 0.4726],
|
||||
)
|
||||
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
|
||||
self.assertLessEqual(max_diff, 1e-3)
|
||||
|
||||
def test_inference_batch_single_identical(self):
|
||||
super().test_inference_batch_single_identical(expected_max_diff=5e-3)
|
||||
|
||||
def test_inversion_dpm(self):
|
||||
device = "cpu"
|
||||
|
||||
components = self.get_dummy_components()
|
||||
|
||||
scheduler_args = {"beta_start": 0.00085, "beta_end": 0.012, "beta_schedule": "scaled_linear"}
|
||||
components["scheduler"] = DPMSolverMultistepScheduler(**scheduler_args)
|
||||
components["inverse_scheduler"] = DPMSolverMultistepInverseScheduler(**scheduler_args)
|
||||
|
||||
pipe = self.pipeline_class(**components)
|
||||
pipe.to(device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inversion_inputs(device)
|
||||
image = pipe.invert(**inputs).images
|
||||
image_slice = image[0, -1, -3:, -3:]
|
||||
|
||||
self.assertEqual(image.shape, (2, 32, 32, 3))
|
||||
expected_slice = np.array(
|
||||
[0.5305, 0.4673, 0.5314, 0.5308, 0.4886, 0.5279, 0.5142, 0.4724, 0.4892],
|
||||
)
|
||||
max_diff = np.abs(image_slice.flatten() - expected_slice).max()
|
||||
self.assertLessEqual(max_diff, 1e-3)
|
||||
|
||||
def test_encode_prompt_works_in_isolation(self):
|
||||
extra_required_param_value_dict = {
|
||||
"device": torch.device(torch_device).type,
|
||||
"do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
|
||||
}
|
||||
return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)
|
||||
|
||||
|
||||
@require_torch_accelerator
|
||||
@nightly
|
||||
class StableDiffusionDiffEditPipelineIntegrationTests(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
gc.collect()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
def tearDown(self):
|
||||
super().tearDown()
|
||||
gc.collect()
|
||||
backend_empty_cache(torch_device)
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
raw_image = load_image(
|
||||
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/diffedit/fruit.png"
|
||||
)
|
||||
raw_image = raw_image.convert("RGB").resize((256, 256))
|
||||
|
||||
cls.raw_image = raw_image
|
||||
|
||||
def test_stable_diffusion_diffedit_full(self):
|
||||
generator = torch.manual_seed(0)
|
||||
|
||||
pipe = StableDiffusionDiffEditPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-2-1-base", safety_checker=None, torch_dtype=torch.float16
|
||||
)
|
||||
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
|
||||
pipe.scheduler.clip_sample = True
|
||||
|
||||
pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
|
||||
pipe.enable_model_cpu_offload(device=torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
source_prompt = "a bowl of fruit"
|
||||
target_prompt = "a bowl of pears"
|
||||
|
||||
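        # DiffEdit is a three-stage edit: contrast the source and target
        # prompts to derive a mask, invert the image into latents under the
        # source prompt, then denoise toward the target prompt inside the mask.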
        mask_image = pipe.generate_mask(
            image=self.raw_image,
            source_prompt=source_prompt,
            target_prompt=target_prompt,
            generator=generator,
        )

        inv_latents = pipe.invert(
            prompt=source_prompt,
            image=self.raw_image,
            inpaint_strength=0.7,
            generator=generator,
            num_inference_steps=5,
        ).latents

        image = pipe(
            prompt=target_prompt,
            mask_image=mask_image,
            image_latents=inv_latents,
            generator=generator,
            negative_prompt=source_prompt,
            inpaint_strength=0.7,
            num_inference_steps=5,
            output_type="np",
        ).images[0]

        expected_image = (
            np.array(
                load_image(
                    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
                    "/diffedit/pears.png"
                ).resize((256, 256))
            )
            / 255
        )

        assert numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten()) < 2e-1


@nightly
@require_torch_accelerator
class StableDiffusionDiffEditPipelineNightlyTests(unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    @classmethod
    def setUpClass(cls):
        raw_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/diffedit/fruit.png"
        )

        raw_image = raw_image.convert("RGB").resize((768, 768))

        cls.raw_image = raw_image

    def test_stable_diffusion_diffedit_dpm(self):
        generator = torch.manual_seed(0)

        pipe = StableDiffusionDiffEditPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16
        )
        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
        pipe.inverse_scheduler = DPMSolverMultistepInverseScheduler.from_config(pipe.scheduler.config)
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        source_prompt = "a bowl of fruit"
        target_prompt = "a bowl of pears"

        mask_image = pipe.generate_mask(
            image=self.raw_image,
            source_prompt=source_prompt,
            target_prompt=target_prompt,
            generator=generator,
        )

        inv_latents = pipe.invert(
            prompt=source_prompt,
            image=self.raw_image,
            inpaint_strength=0.7,
            generator=generator,
            num_inference_steps=25,
        ).latents

        image = pipe(
            prompt=target_prompt,
            mask_image=mask_image,
            image_latents=inv_latents,
            generator=generator,
            negative_prompt=source_prompt,
            inpaint_strength=0.7,
            num_inference_steps=25,
            output_type="np",
        ).images[0]

        expected_image = (
            np.array(
                load_image(
                    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
                    "/diffedit/pears.png"
                ).resize((768, 768))
            )
            / 255
        )
        assert np.abs((expected_image - image).max()) < 5e-1
@@ -1,175 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    EulerAncestralDiscreteScheduler,
    StableDiffusionGLIGENPipeline,
    UNet2DConditionModel,
)
from diffusers.utils.testing_utils import enable_full_determinism

from ..pipeline_params import (
    TEXT_TO_IMAGE_BATCH_PARAMS,
    TEXT_TO_IMAGE_IMAGE_PARAMS,
    TEXT_TO_IMAGE_PARAMS,
)
from ..test_pipelines_common import (
    PipelineFromPipeTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
)


enable_full_determinism()


class GligenPipelineFastTests(
    PipelineLatentTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineTesterMixin,
    PipelineFromPipeTesterMixin,
    unittest.TestCase,
):
    pipeline_class = StableDiffusionGLIGENPipeline
    params = TEXT_TO_IMAGE_PARAMS | {"gligen_phrases", "gligen_boxes"}
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
            attention_type="gated",
        )
        # unet.position_net = PositionNet(32,32)
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            sample_size=128,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A modern livingroom",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "gligen_phrases": ["a birthday cake"],
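            # Each box is a normalized [xmin, ymin, xmax, ymax] rectangle in
            # the 0-1 range, grounding the matching phrase to that region.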
"gligen_boxes": [[0.2676, 0.6088, 0.4773, 0.7183]],
|
||||
"output_type": "np",
|
||||
}
|
||||
return inputs
|
||||
|
||||
def test_stable_diffusion_gligen_default_case(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
components = self.get_dummy_components()
|
||||
sd_pipe = StableDiffusionGLIGENPipeline(**components)
|
||||
sd_pipe = sd_pipe.to(device)
|
||||
sd_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
image = sd_pipe(**inputs).images
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.5069, 0.5561, 0.4577, 0.4792, 0.5203, 0.4089, 0.5039, 0.4919, 0.4499])
|
||||
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
||||
|
||||
def test_stable_diffusion_gligen_k_euler_ancestral(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
components = self.get_dummy_components()
|
||||
sd_pipe = StableDiffusionGLIGENPipeline(**components)
|
||||
sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
|
||||
sd_pipe = sd_pipe.to(device)
|
||||
sd_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
output = sd_pipe(**inputs)
|
||||
image = output.images
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.425, 0.494, 0.429, 0.469, 0.525, 0.417, 0.533, 0.5, 0.47])
|
||||
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
||||
|
||||
def test_attention_slicing_forward_pass(self):
|
||||
super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)
|
||||
|
||||
def test_inference_batch_single_identical(self):
|
||||
super().test_inference_batch_single_identical(batch_size=3, expected_max_diff=3e-3)
|
||||
|
||||
@unittest.skip("Test not supported as tokenizer is used for parsing bounding boxes.")
|
||||
def test_encode_prompt_works_in_isolation(self):
|
||||
pass
|
||||
@@ -1,215 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import torch
from transformers import (
    CLIPProcessor,
    CLIPTextConfig,
    CLIPTextModel,
    CLIPTokenizer,
    CLIPVisionConfig,
    CLIPVisionModelWithProjection,
)

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    EulerAncestralDiscreteScheduler,
    StableDiffusionGLIGENTextImagePipeline,
    UNet2DConditionModel,
)
from diffusers.pipelines.stable_diffusion import CLIPImageProjection
from diffusers.utils import load_image
from diffusers.utils.testing_utils import enable_full_determinism, torch_device

from ..pipeline_params import (
    TEXT_TO_IMAGE_BATCH_PARAMS,
    TEXT_TO_IMAGE_IMAGE_PARAMS,
    TEXT_TO_IMAGE_PARAMS,
)
from ..test_pipelines_common import (
    PipelineFromPipeTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
)


enable_full_determinism()


class GligenTextImagePipelineFastTests(
    PipelineLatentTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineTesterMixin,
    PipelineFromPipeTesterMixin,
    unittest.TestCase,
):
    pipeline_class = StableDiffusionGLIGENTextImagePipeline
    params = TEXT_TO_IMAGE_PARAMS | {"gligen_phrases", "gligen_images", "gligen_boxes"}
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS

    supports_dduf = False

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
            attention_type="gated-text-image",
        )
        # unet.position_net = PositionNet(32,32)
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            sample_size=128,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        image_encoder_config = CLIPVisionConfig(
            hidden_size=32,
            projection_dim=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
        )
        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
        processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

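        # CLIPImageProjection maps the CLIP image embeddings of the grounding
        # references into the text-embedding space consumed by the UNet.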
        image_project = CLIPImageProjection(hidden_size=32)

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
            "image_encoder": image_encoder,
            "image_project": image_project,
            "processor": processor,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        gligen_images = load_image(
            "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/livingroom_modern.png"
        )
        inputs = {
            "prompt": "A modern livingroom",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "gligen_phrases": ["a birthday cake"],
            "gligen_images": [gligen_images],
            "gligen_boxes": [[0.2676, 0.6088, 0.4773, 0.7183]],
            "output_type": "np",
        }
        return inputs

    def test_dict_tuple_outputs_equivalent(self):
        expected_slice = None
        if torch_device == "cpu":
            expected_slice = np.array([0.5052, 0.5546, 0.4567, 0.4770, 0.5195, 0.4085, 0.5026, 0.4909, 0.4495])
        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)

    def test_stable_diffusion_gligen_text_image_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionGLIGENTextImagePipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5069, 0.5561, 0.4577, 0.4792, 0.5203, 0.4089, 0.5039, 0.4919, 0.4499])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_gligen_k_euler_ancestral(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionGLIGENTextImagePipeline(**components)
        sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.425, 0.494, 0.429, 0.469, 0.525, 0.417, 0.533, 0.5, 0.47])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_attention_slicing_forward_pass(self):
        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(batch_size=3, expected_max_diff=3e-3)

    @unittest.skip(
        "Test not supported because of the use of `text_encoder` in `get_cross_attention_kwargs_with_grounded()`."
    )
    def test_encode_prompt_works_in_isolation(self):
        pass
@@ -1,326 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    PNDMScheduler,
    StableDiffusionLDM3DPipeline,
    UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    nightly,
    require_torch_accelerator,
    torch_device,
)

from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS


enable_full_determinism()


class StableDiffusionLDM3DPipelineFastTests(unittest.TestCase):
    pipeline_class = StableDiffusionLDM3DPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
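        # LDM3D predicts RGB and depth jointly, which is why this VAE is
        # configured with six input/output channels instead of the usual three.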
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=6,
            out_channels=6,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
            "image_encoder": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "np",
        }
        return inputs

    def test_stable_diffusion_ddim(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        ldm3d_pipe = StableDiffusionLDM3DPipeline(**components)
        ldm3d_pipe = ldm3d_pipe.to(torch_device)
        ldm3d_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = ldm3d_pipe(**inputs)
        rgb, depth = output.rgb, output.depth

        image_slice_rgb = rgb[0, -3:, -3:, -1]
        image_slice_depth = depth[0, -3:, -1]

        assert rgb.shape == (1, 64, 64, 3)
        assert depth.shape == (1, 64, 64)

        expected_slice_rgb = np.array(
            [0.37338176, 0.70247, 0.74203193, 0.51643604, 0.58256793, 0.60932136, 0.4181095, 0.48355877, 0.46535262]
        )
        expected_slice_depth = np.array([103.46727, 85.812004, 87.849236])

        assert np.abs(image_slice_rgb.flatten() - expected_slice_rgb).max() < 1e-2
        assert np.abs(image_slice_depth.flatten() - expected_slice_depth).max() < 1e-2

    def test_stable_diffusion_prompt_embeds(self):
        components = self.get_dummy_components()
        ldm3d_pipe = StableDiffusionLDM3DPipeline(**components)
        ldm3d_pipe = ldm3d_pipe.to(torch_device)
        ldm3d_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = ldm3d_pipe(**inputs)
        rgb_slice_1, depth_slice_1 = output.rgb, output.depth
        rgb_slice_1 = rgb_slice_1[0, -3:, -3:, -1]
        depth_slice_1 = depth_slice_1[0, -3:, -1]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        text_inputs = ldm3d_pipe.tokenizer(
            prompt,
            padding="max_length",
            max_length=ldm3d_pipe.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_inputs = text_inputs["input_ids"].to(torch_device)

        prompt_embeds = ldm3d_pipe.text_encoder(text_inputs)[0]

        inputs["prompt_embeds"] = prompt_embeds

        # forward
        output = ldm3d_pipe(**inputs)
        rgb_slice_2, depth_slice_2 = output.rgb, output.depth
        rgb_slice_2 = rgb_slice_2[0, -3:, -3:, -1]
        depth_slice_2 = depth_slice_2[0, -3:, -1]

        assert np.abs(rgb_slice_1.flatten() - rgb_slice_2.flatten()).max() < 1e-4
        assert np.abs(depth_slice_1.flatten() - depth_slice_2.flatten()).max() < 1e-4

    def test_stable_diffusion_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
        ldm3d_pipe = StableDiffusionLDM3DPipeline(**components)
        ldm3d_pipe = ldm3d_pipe.to(device)
        ldm3d_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = ldm3d_pipe(**inputs, negative_prompt=negative_prompt)

        rgb, depth = output.rgb, output.depth
        rgb_slice = rgb[0, -3:, -3:, -1]
        depth_slice = depth[0, -3:, -1]

        assert rgb.shape == (1, 64, 64, 3)
        assert depth.shape == (1, 64, 64)

        expected_slice_rgb = np.array(
            [0.37044, 0.71811503, 0.7223251, 0.48603675, 0.5638391, 0.6364948, 0.42833704, 0.4901315, 0.47926217]
        )
        expected_slice_depth = np.array([107.84738, 84.62802, 89.962135])
        assert np.abs(rgb_slice.flatten() - expected_slice_rgb).max() < 1e-2
        assert np.abs(depth_slice.flatten() - expected_slice_depth).max() < 1e-2


@nightly
@require_torch_accelerator
class StableDiffusionLDM3DPipelineSlowTests(unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "np",
        }
        return inputs

    def test_ldm3d_stable_diffusion(self):
        ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d")
        ldm3d_pipe = ldm3d_pipe.to(torch_device)
        ldm3d_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        output = ldm3d_pipe(**inputs)
        rgb, depth = output.rgb, output.depth
        rgb_slice = rgb[0, -3:, -3:, -1].flatten()
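        # NOTE: this slice is taken from `rgb` (matching the recorded expected
        # values below, which share entries with expected_slice_rgb), not from
        # `depth`.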
        depth_slice = rgb[0, -3:, -1].flatten()

        assert rgb.shape == (1, 512, 512, 3)
        assert depth.shape == (1, 512, 512)

        expected_slice_rgb = np.array(
            [0.53805465, 0.56707305, 0.5486515, 0.57012236, 0.5814511, 0.56253487, 0.54843014, 0.55092263, 0.6459706]
        )
        expected_slice_depth = np.array(
            [0.9263781, 0.6678672, 0.5486515, 0.92202145, 0.67831135, 0.56253487, 0.9241694, 0.7551478, 0.6459706]
        )
        assert np.abs(rgb_slice - expected_slice_rgb).max() < 3e-3
        assert np.abs(depth_slice - expected_slice_depth).max() < 3e-3


@nightly
@require_torch_accelerator
class StableDiffusionPipelineNightlyTests(unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 50,
            "guidance_scale": 7.5,
            "output_type": "np",
        }
        return inputs

    def test_ldm3d(self):
        ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d").to(torch_device)
        ldm3d_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        output = ldm3d_pipe(**inputs)
        rgb, depth = output.rgb, output.depth

        expected_rgb_mean = 0.495586
        expected_rgb_std = 0.33795515
        expected_depth_mean = 112.48518
        expected_depth_std = 98.489746
        assert np.abs(expected_rgb_mean - rgb.mean()) < 1e-3
        assert np.abs(expected_rgb_std - rgb.std()) < 1e-3
        assert np.abs(expected_depth_mean - depth.mean()) < 1e-3
        assert np.abs(expected_depth_std - depth.std()) < 1e-3

    def test_ldm3d_v2(self):
        ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d-4c").to(torch_device)
        ldm3d_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        output = ldm3d_pipe(**inputs)
        rgb, depth = output.rgb, output.depth

        expected_rgb_mean = 0.4194127
        expected_rgb_std = 0.35375586
        expected_depth_mean = 0.5638502
        expected_depth_std = 0.34686103

        assert rgb.shape == (1, 512, 512, 3)
        assert depth.shape == (1, 512, 512, 1)
        assert np.abs(expected_rgb_mean - rgb.mean()) < 1e-3
        assert np.abs(expected_rgb_std - rgb.std()) < 1e-3
        assert np.abs(expected_depth_mean - depth.mean()) < 1e-3
        assert np.abs(expected_depth_std - depth.std()) < 1e-3
@@ -1,444 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    EulerAncestralDiscreteScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionPanoramaPipeline,
    UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    backend_max_memory_allocated,
    backend_reset_max_memory_allocated,
    backend_reset_peak_memory_stats,
    enable_full_determinism,
    nightly,
    require_torch_accelerator,
    skip_mps,
    torch_device,
)

from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import (
    IPAdapterTesterMixin,
    PipelineFromPipeTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
)


enable_full_determinism()


@skip_mps
class StableDiffusionPanoramaPipelineFastTests(
    IPAdapterTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
    PipelineFromPipeTesterMixin,
    unittest.TestCase,
):
    pipeline_class = StableDiffusionPanoramaPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=1,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = DDIMScheduler()
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
            "image_encoder": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "a photo of the dolomites",
            "generator": generator,
            # Setting height and width to None to prevent OOMs on CPU.
            "height": None,
            "width": None,
            "num_inference_steps": 1,
            "guidance_scale": 6.0,
            "output_type": "np",
        }
        return inputs

    def test_stable_diffusion_panorama_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]
        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.6186, 0.5374, 0.4915, 0.4135, 0.4114, 0.4563, 0.5128, 0.4977, 0.4757])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_panorama_circular_padding_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
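        # circular_padding wraps the latent horizontally so the left and right
        # edges of the generated panorama line up without a visible seam.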
        image = sd_pipe(**inputs, circular_padding=True).images
        image_slice = image[0, -3:, -3:, -1]
        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.6127, 0.6299, 0.4595, 0.4051, 0.4543, 0.3925, 0.5510, 0.5693, 0.5031])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    # override to speed the overall test timing up.
    def test_inference_batch_consistent(self):
        super().test_inference_batch_consistent(batch_sizes=[1, 2])

    # override to speed the overall test timing up.
    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=5.0e-3)

    def test_float16_inference(self):
        super().test_float16_inference(expected_max_diff=1e-1)

    def test_stable_diffusion_panorama_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.6187, 0.5375, 0.4915, 0.4136, 0.4114, 0.4563, 0.5128, 0.4976, 0.4757])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_panorama_views_batch(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs, view_batch_size=2)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.6187, 0.5375, 0.4915, 0.4136, 0.4114, 0.4563, 0.5128, 0.4976, 0.4757])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_panorama_views_batch_circular_padding(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs, circular_padding=True, view_batch_size=2)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.6127, 0.6299, 0.4595, 0.4051, 0.4543, 0.3925, 0.5510, 0.5693, 0.5031])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_panorama_euler(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = EulerAncestralDiscreteScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
        )
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.4024, 0.6510, 0.4901, 0.5378, 0.5813, 0.5622, 0.4795, 0.4467, 0.4952])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_panorama_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True
        )
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.6391, 0.6291, 0.4861, 0.5134, 0.5552, 0.4578, 0.5032, 0.5023, 0.4539])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_encode_prompt_works_in_isolation(self):
        extra_required_param_value_dict = {
            "device": torch.device(torch_device).type,
            "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
        }
        return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)


@nightly
@require_torch_accelerator
class StableDiffusionPanoramaNightlyTests(unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def get_inputs(self, seed=0):
        generator = torch.manual_seed(seed)
        inputs = {
            "prompt": "a photo of the dolomites",
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "np",
        }
        return inputs

    def test_stable_diffusion_panorama_default(self):
        model_ckpt = "stabilityai/stable-diffusion-2-base"
        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 2048, 3)

        expected_slice = np.array(
            [0.36968392, 0.27025372, 0.32446766, 0.28379387, 0.36363274, 0.30733347, 0.27100027, 0.27054125, 0.25536096]
        )

        assert np.abs(expected_slice - image_slice).max() < 1e-2

    def test_stable_diffusion_panorama_k_lms(self):
        pipe = StableDiffusionPanoramaPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-base", safety_checker=None
        )
        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
        pipe.unet.set_default_attn_processor()
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()
        assert image.shape == (1, 512, 2048, 3)

        expected_slice = np.array([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])

        assert np.abs(expected_slice - image_slice).max() < 1e-2

    def test_stable_diffusion_panorama_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.Tensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
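                # A 512x2048 panorama with the VAE's 8x downsampling gives a
                # (1, 4, 64, 256) latent.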
                assert latents.shape == (1, 4, 64, 256)
                latents_slice = latents[0, -3:, -3:, -1]

                expected_slice = np.array(
                    [0.18681869, 0.33907816, 0.5361276, 0.14432865, -0.02856611,
                     -0.73941123, 0.23397987, 0.47322682, -0.37823164]
                )
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 256)
                latents_slice = latents[0, -3:, -3:, -1]

                expected_slice = np.array(
                    [0.18539645, 0.33987248, 0.5378559, 0.14437142, -0.02455261,
                     -0.7338317, 0.23990755, 0.47356272, -0.3786505]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        model_ckpt = "stabilityai/stable-diffusion-2-base"
        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == 3

    def test_stable_diffusion_panorama_pipeline_with_sequential_cpu_offloading(self):
        backend_empty_cache(torch_device)
        backend_reset_max_memory_allocated(torch_device)
        backend_reset_peak_memory_stats(torch_device)

        model_ckpt = "stabilityai/stable-diffusion-2-base"
        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs()
        _ = pipe(**inputs)

        mem_bytes = backend_max_memory_allocated(torch_device)
        # make sure that less than 5.5 GB is allocated
        assert mem_bytes < 5.5 * 10**9
@@ -1,497 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import tempfile
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
from diffusers.pipelines.stable_diffusion_safe import StableDiffusionPipelineSafe as StableDiffusionPipeline
from diffusers.utils.testing_utils import (
    Expectations,
    backend_empty_cache,
    floats_tensor,
    nightly,
    require_accelerator,
    require_torch_accelerator,
    torch_device,
)


class SafeDiffusionPipelineFastTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    @property
    def dummy_image(self):
        batch_size = 1
        num_channels = 3
        sizes = (32, 32)

        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
        return image

    @property
    def dummy_cond_unet(self):
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        return model

    @property
    def dummy_vae(self):
        torch.manual_seed(0)
        model = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        return model

    @property
    def dummy_text_encoder(self):
        torch.manual_seed(0)
        config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        return CLIPTextModel(config)

    @property
    def dummy_extractor(self):
def extract(*args, **kwargs):
|
||||
class Out:
|
||||
def __init__(self):
|
||||
self.pixel_values = torch.ones([0])
|
||||
|
||||
def to(self, device):
|
||||
self.pixel_values.to(device)
|
||||
return self
|
||||
|
||||
return Out()
|
||||
|
||||
return extract
|
||||
|
||||
def test_safe_diffusion_ddim(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
unet = self.dummy_cond_unet
|
||||
scheduler = DDIMScheduler(
|
||||
beta_start=0.00085,
|
||||
beta_end=0.012,
|
||||
beta_schedule="scaled_linear",
|
||||
clip_sample=False,
|
||||
set_alpha_to_one=False,
|
||||
)
|
||||
|
||||
vae = self.dummy_vae
|
||||
bert = self.dummy_text_encoder
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
|
        # assemble the components into the safe pipeline (DDIM here, so no PRK steps are involved)
        sd_pipe = StableDiffusionPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5756, 0.6118, 0.5005, 0.5041, 0.5471, 0.4726, 0.4976, 0.4865, 0.4864])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5125, 0.5716, 0.4828, 0.5060, 0.5650, 0.4768, 0.5185, 0.4895, 0.4993])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_no_safety_checker(self):
        pipe = StableDiffusionPipeline.from_pretrained(
            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
        )
        assert isinstance(pipe, StableDiffusionPipeline)
        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
        assert pipe.safety_checker is None

        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

        # check that there's no error when saving a pipeline with one of the models being None
        with tempfile.TemporaryDirectory() as tmpdirname:
            pipe.save_pretrained(tmpdirname)
            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)

        # sanity check that the pipeline still works
        assert pipe.safety_checker is None
        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

    @require_accelerator
    def test_stable_diffusion_fp16(self):
        """Test that stable diffusion works with fp16"""
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # put models in fp16
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images

        assert image.shape == (1, 64, 64, 3)


@nightly
@require_torch_accelerator
class SafeDiffusionPipelineIntegrationTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_harm_safe_stable_diffusion(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
        )
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = (
            "portrait of girl with smokey eyes makeup in abandoned hotel, grange clothes, redshift, wide high angle"
            " coloured polaroid photograph with flash, kodak film, hyper real, stunning moody cinematography, with"
            " anamorphic lenses, by maripol, fallen angels by wong kar - wai, style of suspiria and neon demon and"
            " children from bahnhof zoo, detailed "
        )
        seed = 4003660346
        guidance_scale = 7

        # without safety guidance (sld_guidance_scale = 0)
        generator = torch.manual_seed(seed)
        output = sd_pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            sld_guidance_scale=0,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
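        # reference slices are keyed by (device type, hardware generation); get_expectation() picks the matching one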
        expected_slices = Expectations(
            {
                ("xpu", 3): [0.0076, 0.0058, 0.0012, 0, 0.0047, 0.0046, 0, 0, 0],
                ("cuda", 7): [0.2278, 0.2231, 0.2249, 0.2333, 0.2303, 0.1885, 0.2273, 0.2144, 0.2176],
                ("cuda", 8): [0.0076, 0.0058, 0.0012, 0, 0.0047, 0.0046, 0, 0, 0],
            }
        )
        expected_slice = expected_slices.get_expectation()

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

        # with safety guidance (strong configuration)
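        # the sld_* kwargs configure Safe Latent Diffusion: guidance strength, warmup steps, threshold, and momentum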
        generator = torch.manual_seed(seed)
        output = sd_pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            sld_guidance_scale=2000,
            sld_warmup_steps=7,
            sld_threshold=0.025,
            sld_momentum_scale=0.5,
            sld_mom_beta=0.7,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        expected_slices = Expectations(
            {
                ("xpu", 3): [0.0443, 0.0439, 0.0381, 0.0336, 0.0408, 0.0345, 0.0405, 0.0338, 0.0293],
                ("cuda", 7): [0.2383, 0.2276, 0.236, 0.2192, 0.2186, 0.2053, 0.1971, 0.1901, 0.1719],
                ("cuda", 8): [0.0443, 0.0439, 0.0381, 0.0336, 0.0408, 0.0345, 0.0405, 0.0338, 0.0293],
            }
        )
        expected_slice = expected_slices.get_expectation()

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_nudity_safe_stable_diffusion(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained(
            "stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
        )
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "padme amidala taking a bath artwork, safe for work, no nudity"
        seed = 2734971755
        guidance_scale = 7

        generator = torch.manual_seed(seed)
        output = sd_pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            sld_guidance_scale=0,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slices = Expectations(
            {
                ("xpu", 3): [0.3244, 0.3355, 0.3260, 0.3123, 0.3246, 0.3426, 0.3109, 0.3471, 0.4001],
                ("cuda", 7): [0.3502, 0.3622, 0.3396, 0.3642, 0.3478, 0.3318, 0.35, 0.3348, 0.3297],
                ("cuda", 8): [0.3605, 0.3684, 0.3712, 0.3624, 0.3675, 0.3726, 0.3494, 0.3748, 0.4044],
            }
        )
        expected_slice = expected_slices.get_expectation()
        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

        generator = torch.manual_seed(seed)
        output = sd_pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            sld_guidance_scale=2000,
            sld_warmup_steps=7,
            sld_threshold=0.025,
            sld_momentum_scale=0.5,
            sld_mom_beta=0.7,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slices = Expectations(
            {
                ("xpu", 3): [0.6178, 0.6260, 0.6194, 0.6435, 0.6265, 0.6461, 0.6567, 0.6576, 0.6444],
                ("cuda", 7): [0.5531, 0.5206, 0.4895, 0.5156, 0.5182, 0.4751, 0.4802, 0.4803, 0.4443],
                ("cuda", 8): [0.5892, 0.5959, 0.5914, 0.6123, 0.5982, 0.6141, 0.6180, 0.6262, 0.6171],
            }
        )

        expected_slice = expected_slices.get_expectation()

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_nudity_safetychecker_safe_stable_diffusion(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = (
            "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c."
            " leyendecker"
        )
        seed = 1044355234
        guidance_scale = 12

        generator = torch.manual_seed(seed)
        output = sd_pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            sld_guidance_scale=0,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
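        # with the safety checker active and no safe guidance, the flagged output is blacked out (all zeros)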
        expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-7

        generator = torch.manual_seed(seed)
        output = sd_pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            sld_guidance_scale=2000,
            sld_warmup_steps=7,
            sld_threshold=0.025,
            sld_momentum_scale=0.5,
            sld_mom_beta=0.7,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slices = Expectations(
            {
                ("xpu", 3): np.array([0.0695, 0.1244, 0.1831, 0.0527, 0.0444, 0.1660, 0.0572, 0.0677, 0.1551]),
                ("cuda", 7): np.array([0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334, 0.6561]),
                ("cuda", 8): np.array([0.0695, 0.1244, 0.1831, 0.0527, 0.0444, 0.1660, 0.0572, 0.0677, 0.1551]),
            }
        )
        expected_slice = expected_slices.get_expectation()

        assert image.shape == (1, 512, 512, 3)
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -1,245 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DEISMultistepScheduler,
    DPMSolverMultistepScheduler,
    EulerDiscreteScheduler,
    StableDiffusionSAGPipeline,
    UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    nightly,
    require_torch_accelerator,
    torch_device,
)

from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import (
    IPAdapterTesterMixin,
    PipelineFromPipeTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
)


enable_full_determinism()


class StableDiffusionSAGPipelineFastTests(
    IPAdapterTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
    PipelineFromPipeTesterMixin,
    unittest.TestCase,
):
    pipeline_class = StableDiffusionSAGPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(4, 8),
            layers_per_block=2,
            sample_size=8,
            norm_num_groups=1,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=8,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[4, 8],
            norm_num_groups=1,
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=8,
            num_hidden_layers=2,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
            "image_encoder": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
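        # sag_scale sets the strength of self-attention guidance, applied on top of classifier-free guidance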
        inputs = {
            "prompt": ".",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 1.0,
            "sag_scale": 1.0,
            "output_type": "np",
        }
        return inputs

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=3e-3)

    @unittest.skip("Not necessary to test here.")
    def test_xformers_attention_forwardGenerator_pass(self):
        pass

    def test_pipeline_different_schedulers(self):
        pipeline = self.pipeline_class(**self.get_dummy_components())
        inputs = self.get_dummy_inputs("cpu")

        expected_image_shape = (16, 16, 3)
        for scheduler_cls in [DDIMScheduler, DEISMultistepScheduler, DPMSolverMultistepScheduler]:
            pipeline.scheduler = scheduler_cls.from_config(pipeline.scheduler.config)
            image = pipeline(**inputs).images[0]

            shape = image.shape
            assert shape == expected_image_shape

        pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)

        with self.assertRaises(ValueError):
            # Karras schedulers are not supported
            image = pipeline(**inputs).images[0]

    def test_encode_prompt_works_in_isolation(self):
        extra_required_param_value_dict = {
            "device": torch.device(torch_device).type,
            "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
        }
        return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)


@nightly
@require_torch_accelerator
class StableDiffusionPipelineIntegrationTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_stable_diffusion_1(self):
        sag_pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        sag_pipe = sag_pipe.to(torch_device)
        sag_pipe.set_progress_bar_config(disable=None)

        prompt = "."
        generator = torch.manual_seed(0)
        output = sag_pipe(
            [prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np"
        )

        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.1568, 0.1738, 0.1695, 0.1693, 0.1507, 0.1705, 0.1547, 0.1751, 0.1949])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2

    def test_stable_diffusion_2(self):
        sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
        sag_pipe = sag_pipe.to(torch_device)
        sag_pipe.set_progress_bar_config(disable=None)

        prompt = "."
        generator = torch.manual_seed(0)
        output = sag_pipe(
            [prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np"
        )

        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.3459, 0.2876, 0.2537, 0.3002, 0.2671, 0.2160, 0.3026, 0.2262, 0.2371])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2

    def test_stable_diffusion_2_non_square(self):
        sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
        sag_pipe = sag_pipe.to(torch_device)
        sag_pipe.set_progress_bar_config(disable=None)

        prompt = "."
        generator = torch.manual_seed(0)
        output = sag_pipe(
            [prompt],
            width=768,
            height=512,
            generator=generator,
            guidance_scale=7.5,
            sag_scale=1.0,
            num_inference_steps=20,
            output_type="np",
        )

        image = output.images

        assert image.shape == (1, 512, 768, 3)
@@ -1,231 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoSDPipeline, UNet3DConditionModel
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    load_numpy,
    numpy_cosine_similarity_distance,
    require_torch_accelerator,
    skip_mps,
    slow,
    torch_device,
)

from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, SDFunctionTesterMixin


enable_full_determinism()


@skip_mps
class TextToVideoSDPipelineFastTests(PipelineTesterMixin, SDFunctionTesterMixin, unittest.TestCase):
    pipeline_class = TextToVideoSDPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    # No `output_type`.
    required_optional_params = frozenset(
        [
            "num_inference_steps",
            "generator",
            "latents",
            "return_dict",
            "callback",
            "callback_steps",
        ]
    )

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet3DConditionModel(
            block_out_channels=(8, 8),
            layers_per_block=1,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
            cross_attention_dim=4,
            attention_head_dim=4,
            norm_num_groups=2,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=(8,),
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D"],
            latent_channels=4,
            sample_size=32,
            norm_num_groups=2,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=4,
            intermediate_size=16,
            layer_norm_eps=1e-05,
            num_attention_heads=2,
            num_hidden_layers=2,
            pad_token_id=1,
            vocab_size=1000,
            hidden_act="gelu",
            projection_dim=32,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
        }
        return components

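    # frames default to "pt" tensors here; individual tests switch output_type to "np" where needed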
    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "pt",
        }
        return inputs

    def test_dict_tuple_outputs_equivalent(self):
        return super().test_dict_tuple_outputs_equivalent()

    def test_text_to_video_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = TextToVideoSDPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["output_type"] = "np"
        frames = sd_pipe(**inputs).frames

        image_slice = frames[0][0][-3:, -3:, -1]
        assert frames[0][0].shape == (32, 32, 3)
        expected_slice = np.array([0.8093, 0.2751, 0.6976, 0.5927, 0.4616, 0.4336, 0.5094, 0.5683, 0.4796])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    @unittest.skipIf(torch_device != "cuda", reason="Feature isn't heavily used. Test in CUDA environment only.")
    def test_attention_slicing_forward_pass(self):
        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False, expected_max_diff=3e-3)

    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    def test_xformers_attention_forwardGenerator_pass(self):
        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=1e-2)

    # (todo): sayakpaul
    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
    def test_inference_batch_consistent(self):
        pass

    # (todo): sayakpaul
    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
    def test_inference_batch_single_identical(self):
        pass

    @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
    def test_num_images_per_prompt(self):
        pass

    def test_encode_prompt_works_in_isolation(self):
        extra_required_param_value_dict = {
            "device": torch.device(torch_device).type,
            "num_images_per_prompt": 1,
            "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
        }
        return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)


@slow
@skip_mps
@require_torch_accelerator
class TextToVideoSDPipelineSlowTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_two_step_model(self):
        expected_video = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/video_2step.npy"
        )

        pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
        pipe = pipe.to(torch_device)

        prompt = "Spiderman is surfing"
        generator = torch.Generator(device="cpu").manual_seed(0)

        video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").frames
        assert numpy_cosine_similarity_distance(expected_video.flatten(), video_frames.flatten()) < 1e-4

    def test_two_step_model_with_freeu(self):

        pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
        pipe = pipe.to(torch_device)

        prompt = "Spiderman is surfing"
        generator = torch.Generator(device="cpu").manual_seed(0)

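        # FreeU rescales the UNet's backbone (b1, b2) and skip-connection (s1, s2) features at inference time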
        pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
        video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").frames
        video = video_frames[0, 0, -3:, -3:, -1].flatten()

        expected_video = [0.3643, 0.3455, 0.3831, 0.3923, 0.2978, 0.3247, 0.3278, 0.3201, 0.3475]

        assert np.abs(expected_video - video).mean() < 5e-2
@@ -1,62 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import torch

from diffusers import DDIMScheduler, TextToVideoZeroPipeline
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    load_pt,
    nightly,
    require_torch_accelerator,
    torch_device,
)

from ..test_pipelines_common import assert_mean_pixel_difference


@nightly
@require_torch_accelerator
class TextToVideoZeroPipelineSlowTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_full_model(self):
        model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
        pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(torch_device)
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        generator = torch.Generator(device="cpu").manual_seed(0)

        prompt = "A bear is playing a guitar on Times Square"
        result = pipe(prompt=prompt, generator=generator).images

        expected_result = load_pt(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text-to-video/A bear is playing a guitar on Times Square.pt",
            weights_only=False,
        )

        assert_mean_pixel_difference(result, expected_result)
@@ -1,403 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import inspect
import tempfile
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer

from diffusers import AutoencoderKL, DDIMScheduler, TextToVideoZeroSDXLPipeline, UNet2DConditionModel
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    nightly,
    require_accelerate_version_greater,
    require_torch_accelerator,
    torch_device,
)

from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineFromPipeTesterMixin, PipelineTesterMixin


enable_full_determinism()


def to_np(tensor):
    if isinstance(tensor, torch.Tensor):
        tensor = tensor.detach().cpu().numpy()

    return tensor


class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase):
    pipeline_class = TextToVideoZeroSDXLPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
    generator_device = "cpu"

    def get_dummy_components(self, seed=0):
        torch.manual_seed(seed)
        unet = UNet2DConditionModel(
            block_out_channels=(2, 4),
            layers_per_block=2,
            sample_size=2,
            norm_num_groups=2,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            # SD2-specific config below
            attention_head_dim=(2, 4),
            use_linear_projection=True,
            addition_embed_type="text_time",
            addition_time_embed_dim=8,
            transformer_layers_per_block=(1, 2),
            projection_class_embeddings_input_dim=80,  # 6 * 8 + 32
            cross_attention_dim=64,
        )
        scheduler = DDIMScheduler(
            num_train_timesteps=1000,
            beta_start=0.0001,
            beta_end=0.02,
            beta_schedule="linear",
            trained_betas=None,
            clip_sample=True,
            set_alpha_to_one=True,
            steps_offset=0,
            prediction_type="epsilon",
            thresholding=False,
            dynamic_thresholding_ratio=0.995,
            clip_sample_range=1.0,
            sample_max_value=1.0,
            timestep_spacing="leading",
            rescale_betas_zero_snr=False,
        )
        torch.manual_seed(seed)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            sample_size=128,
        )
        torch.manual_seed(seed)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
            # SD2-specific config below
            hidden_act="gelu",
            projection_dim=32,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "text_encoder_2": text_encoder_2,
            "tokenizer_2": tokenizer_2,
            "image_encoder": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
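        # t0 and t1 bound the backward-diffusion steps used when warping latents across frames;
        # video_length sets the number of generated frames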
        inputs = {
            "prompt": "A panda dancing in Antarctica",
            "generator": generator,
            "num_inference_steps": 5,
            "t0": 1,
            "t1": 3,
            "height": 64,
            "width": 64,
            "video_length": 3,
            "output_type": "np",
        }
        return inputs

    def get_generator(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        return generator

    def test_text_to_video_zero_sdxl(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe = pipe.to(torch_device)

        inputs = self.get_dummy_inputs(self.generator_device)
        result = pipe(**inputs).images

        first_frame_slice = result[0, -3:, -3:, -1]
        last_frame_slice = result[-1, -3:, -3:, 0]

        expected_slice1 = np.array(
            [0.6008109, 0.73051643, 0.51778656, 0.55817354, 0.45222935, 0.45998418, 0.57017255, 0.54874814, 0.47078788]
        )
        expected_slice2 = np.array(
            [0.6011751, 0.47420046, 0.41660714, 0.6472957, 0.41261768, 0.5438129, 0.7401535, 0.6756011, 0.53652245]
        )

        assert np.abs(first_frame_slice.flatten() - expected_slice1).max() < 1e-2
        assert np.abs(last_frame_slice.flatten() - expected_slice2).max() < 1e-2

    @unittest.skip(
        reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
    )
    def test_attention_slicing_forward_pass(self):
        pass

    def test_cfg(self):
        sig = inspect.signature(self.pipeline_class.__call__)
        if "guidance_scale" not in sig.parameters:
            return
        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(self.generator_device)

        inputs["guidance_scale"] = 1.0
        out_no_cfg = pipe(**inputs)[0]

        inputs["guidance_scale"] = 7.5
        out_cfg = pipe(**inputs)[0]

        assert out_cfg.shape == out_no_cfg.shape

    def test_dict_tuple_outputs_equivalent(self, expected_max_difference=1e-4):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        output = pipe(**self.get_dummy_inputs(self.generator_device))[0]
        output_tuple = pipe(**self.get_dummy_inputs(self.generator_device), return_dict=False)[0]

        max_diff = np.abs(to_np(output) - to_np(output_tuple)).max()
        self.assertLess(max_diff, expected_max_difference)

    @unittest.skipIf(torch_device not in ["cuda", "xpu"], reason="float16 requires CUDA or XPU")
    @require_torch_accelerator
    def test_float16_inference(self, expected_max_diff=5e-2):
        components = self.get_dummy_components()
        for name, module in components.items():
            if hasattr(module, "half"):
                components[name] = module.to(torch_device).half()
        pipe = self.pipeline_class(**components)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        components = self.get_dummy_components()
        pipe_fp16 = self.pipeline_class(**components)
        pipe_fp16.to(torch_device, torch.float16)
        pipe_fp16.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(self.generator_device)
        # Reset generator in case it is used inside dummy inputs
        if "generator" in inputs:
            inputs["generator"] = self.get_generator(self.generator_device)

        output = pipe(**inputs)[0]

        fp16_inputs = self.get_dummy_inputs(self.generator_device)
        # Reset generator in case it is used inside dummy inputs
        if "generator" in fp16_inputs:
            fp16_inputs["generator"] = self.get_generator(self.generator_device)

        output_fp16 = pipe_fp16(**fp16_inputs)[0]

        max_diff = np.abs(to_np(output) - to_np(output_fp16)).max()
        self.assertLess(max_diff, expected_max_diff, "The outputs of the fp16 and fp32 pipelines are too different.")

    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
    def test_inference_batch_consistent(self):
        pass

    @unittest.skip(
        reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
    )
    def test_inference_batch_single_identical(self):
        pass

    @require_torch_accelerator
    @require_accelerate_version_greater("0.17.0")
    def test_model_cpu_offload_forward_pass(self, expected_max_diff=2e-4):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(self.generator_device)
        output_without_offload = pipe(**inputs)[0]

        pipe.enable_model_cpu_offload(device=torch_device)
        inputs = self.get_dummy_inputs(self.generator_device)
        output_with_offload = pipe(**inputs)[0]

        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
        self.assertLess(max_diff, expected_max_diff, "CPU offloading should not affect the inference results")

    @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
    def test_pipeline_call_signature(self):
        pass

    @unittest.skipIf(torch_device not in ["cuda", "xpu"], reason="float16 requires CUDA or XPU")
    @require_torch_accelerator
    def test_save_load_float16(self, expected_max_diff=1e-2):
        components = self.get_dummy_components()
        for name, module in components.items():
            if hasattr(module, "half"):
                components[name] = module.to(torch_device).half()

        pipe = self.pipeline_class(**components)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(self.generator_device)
        output = pipe(**inputs)[0]

        with tempfile.TemporaryDirectory() as tmpdir:
            pipe.save_pretrained(tmpdir)
            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, torch_dtype=torch.float16)
            pipe_loaded.to(torch_device)
            pipe_loaded.set_progress_bar_config(disable=None)

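            # every component should still be float16 after the save/load round trip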
            for name, component in pipe_loaded.components.items():
                if hasattr(component, "dtype"):
                    self.assertTrue(
                        component.dtype == torch.float16,
                        f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.",
                    )

        inputs = self.get_dummy_inputs(self.generator_device)
        output_loaded = pipe_loaded(**inputs)[0]
        max_diff = np.abs(to_np(output) - to_np(output_loaded)).max()
        self.assertLess(
            max_diff, expected_max_diff, "The output of the fp16 pipeline changed after saving and loading."
        )

    @unittest.skip(
        reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
    )
    def test_save_load_local(self):
        pass

    @unittest.skip(
        reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
    )
    def test_save_load_optional_components(self):
        pass

    @unittest.skip(
        reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
    )
    def test_sequential_cpu_offload_forward_pass(self):
        pass

    @require_torch_accelerator
    def test_to_device(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.set_progress_bar_config(disable=None)

        pipe.to("cpu")
        model_devices = [component.device.type for component in components.values() if hasattr(component, "device")]
        self.assertTrue(all(device == "cpu" for device in model_devices))

        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]  # generator set to cpu
        self.assertTrue(np.isnan(output_cpu).sum() == 0)

        pipe.to(torch_device)
        model_devices = [component.device.type for component in components.values() if hasattr(component, "device")]
        self.assertTrue(all(device == torch_device for device in model_devices))

        output_device = pipe(**self.get_dummy_inputs("cpu"))[0]  # generator set to cpu
        self.assertTrue(np.isnan(to_np(output_device)).sum() == 0)

    @unittest.skip(
        reason="Cannot call `set_default_attn_processor` as this pipeline uses a specific attention processor."
    )
    def test_xformers_attention_forwardGenerator_pass(self):
        pass


@nightly
@require_torch_accelerator
class TextToVideoZeroSDXLPipelineSlowTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_full_model(self):
        model_id = "stabilityai/stable-diffusion-xl-base-1.0"
        pipe = TextToVideoZeroSDXLPipeline.from_pretrained(
            model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True
        )
        pipe.enable_model_cpu_offload()
        pipe.enable_vae_slicing()

        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        generator = torch.Generator(device="cpu").manual_seed(0)

        prompt = "A panda dancing in Antarctica"
        result = pipe(prompt=prompt, generator=generator).images

        first_frame_slice = result[0, -3:, -3:, -1]
        last_frame_slice = result[-1, -3:, -3:, 0]

        expected_slice1 = np.array([0.57, 0.57, 0.57, 0.57, 0.57, 0.56, 0.55, 0.56, 0.56])
        expected_slice2 = np.array([0.54, 0.53, 0.53, 0.53, 0.53, 0.52, 0.53, 0.53, 0.53])

        assert np.abs(first_frame_slice.flatten() - expected_slice1).max() < 1e-2
        assert np.abs(last_frame_slice.flatten() - expected_slice2).max() < 1e-2
@@ -1,229 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import random
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    UNet3DConditionModel,
    VideoToVideoSDPipeline,
)
from diffusers.utils import is_xformers_available
from diffusers.utils.testing_utils import (
    enable_full_determinism,
    floats_tensor,
    is_flaky,
    nightly,
    numpy_cosine_similarity_distance,
    skip_mps,
    torch_device,
)

from ..pipeline_params import (
    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
)
from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


@skip_mps
class VideoToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = VideoToVideoSDPipeline
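    # image params are swapped for a "video" input; width/height are fixed by the input clip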
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS.union({"video"}) - {"image", "width", "height"}
    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"video"}) - {"image"}
    test_attention_slicing = False

    # No `output_type`.
    required_optional_params = frozenset(
        [
            "num_inference_steps",
            "generator",
            "latents",
            "return_dict",
            "callback",
            "callback_steps",
        ]
    )

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet3DConditionModel(
            block_out_channels=(4, 8),
            layers_per_block=1,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
            cross_attention_dim=32,
            attention_head_dim=4,
            norm_num_groups=2,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=True,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[8],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D"],
            latent_channels=4,
            sample_size=32,
            norm_num_groups=2,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
            hidden_act="gelu",
            projection_dim=512,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        # 3 frames
        video = floats_tensor((1, 3, 3, 32, 32), rng=random.Random(seed)).to(device)

        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "video": video,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "pt",
        }
        return inputs

    def test_text_to_video_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = VideoToVideoSDPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["output_type"] = "np"
        frames = sd_pipe(**inputs).frames
        image_slice = frames[0][0][-3:, -3:, -1]

        assert frames[0][0].shape == (32, 32, 3)
        expected_slice = np.array([0.6391, 0.5350, 0.5202, 0.5521, 0.5453, 0.5393, 0.6652, 0.5270, 0.5185])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    @is_flaky()
    def test_save_load_optional_components(self):
        super().test_save_load_optional_components(expected_max_difference=0.001)

    @is_flaky()
    def test_dict_tuple_outputs_equivalent(self):
        super().test_dict_tuple_outputs_equivalent()

    @is_flaky()
    def test_save_load_local(self):
        super().test_save_load_local()

    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    def test_xformers_attention_forwardGenerator_pass(self):
        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=5e-3)

    # (todo): sayakpaul
    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
    def test_inference_batch_consistent(self):
        pass

    # (todo): sayakpaul
    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
    def test_inference_batch_single_identical(self):
        pass

    @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
    def test_num_images_per_prompt(self):
        pass

    def test_encode_prompt_works_in_isolation(self):
        extra_required_param_value_dict = {
            "device": torch.device(torch_device).type,
            "num_images_per_prompt": 1,
            "do_classifier_free_guidance": self.get_dummy_inputs(device=torch_device).get("guidance_scale", 1.0) > 1.0,
        }
        return super().test_encode_prompt_works_in_isolation(extra_required_param_value_dict)


@nightly
@skip_mps
class VideoToVideoSDPipelineSlowTests(unittest.TestCase):
    def test_two_step_model(self):
        pipe = VideoToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dtype=torch.float16)
        pipe.enable_model_cpu_offload()

        # 10 frames
        generator = torch.Generator(device="cpu").manual_seed(0)
        video = torch.randn((1, 10, 3, 320, 576), generator=generator)

        prompt = "Spiderman is surfing"

        generator = torch.Generator(device="cpu").manual_seed(0)
        video_frames = pipe(prompt, video=video, generator=generator, num_inference_steps=3, output_type="np").frames

        expected_array = np.array(
            [0.17114258, 0.13720703, 0.08886719, 0.14819336, 0.1730957, 0.24584961, 0.22021484, 0.35180664, 0.2607422]
        )
        output_array = video_frames[0, 0, :3, :3, 0].flatten()
||||
assert numpy_cosine_similarity_distance(expected_array, output_array) < 1e-3
|
||||
@@ -1,523 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer

from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel
from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    backend_max_memory_allocated,
    backend_reset_max_memory_allocated,
    backend_reset_peak_memory_stats,
    enable_full_determinism,
    load_numpy,
    nightly,
    require_torch_accelerator,
    skip_mps,
    torch_device,
)

from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference


enable_full_determinism()


class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = UnCLIPPipeline
    params = TEXT_TO_IMAGE_PARAMS - {
        "negative_prompt",
        "height",
        "width",
        "negative_prompt_embeds",
        "guidance_scale",
        "prompt_embeds",
        "cross_attention_kwargs",
    }
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    required_optional_params = [
        "generator",
        "return_dict",
        "prior_num_inference_steps",
        "decoder_num_inference_steps",
        "super_res_num_inference_steps",
    ]
    test_xformers_attention = False

    @property
    def text_embedder_hidden_size(self):
        return 32

    @property
    def time_input_dim(self):
        return 32

    @property
    def block_out_channels_0(self):
        return self.time_input_dim

    @property
    def time_embed_dim(self):
        return self.time_input_dim * 4

    @property
    def cross_attention_dim(self):
        return 100

    @property
    def dummy_tokenizer(self):
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        return tokenizer

    @property
    def dummy_text_encoder(self):
        torch.manual_seed(0)
        config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=self.text_embedder_hidden_size,
            projection_dim=self.text_embedder_hidden_size,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        return CLIPTextModelWithProjection(config)

    @property
    def dummy_prior(self):
        torch.manual_seed(0)

        model_kwargs = {
            "num_attention_heads": 2,
            "attention_head_dim": 12,
            "embedding_dim": self.text_embedder_hidden_size,
            "num_layers": 1,
        }

        model = PriorTransformer(**model_kwargs)
        return model

    @property
    def dummy_text_proj(self):
        torch.manual_seed(0)

        model_kwargs = {
            "clip_embeddings_dim": self.text_embedder_hidden_size,
            "time_embed_dim": self.time_embed_dim,
            "cross_attention_dim": self.cross_attention_dim,
        }

        model = UnCLIPTextProjModel(**model_kwargs)
        return model

    @property
    def dummy_decoder(self):
        torch.manual_seed(0)

        model_kwargs = {
            "sample_size": 32,
            # RGB in channels
            "in_channels": 3,
            # Out channels is double in channels because predicts mean and variance
            "out_channels": 6,
            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
            "layers_per_block": 1,
            "cross_attention_dim": self.cross_attention_dim,
            "attention_head_dim": 4,
            "resnet_time_scale_shift": "scale_shift",
            "class_embed_type": "identity",
        }

        model = UNet2DConditionModel(**model_kwargs)
        return model

    @property
    def dummy_super_res_kwargs(self):
        return {
            "sample_size": 64,
            "layers_per_block": 1,
            "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"),
            "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"),
            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
            "in_channels": 6,
            "out_channels": 3,
        }

    @property
    def dummy_super_res_first(self):
        torch.manual_seed(0)

        model = UNet2DModel(**self.dummy_super_res_kwargs)
        return model

    @property
    def dummy_super_res_last(self):
        # seeded differently to get different unet than `self.dummy_super_res_first`
        torch.manual_seed(1)

        model = UNet2DModel(**self.dummy_super_res_kwargs)
        return model

    def get_dummy_components(self):
        prior = self.dummy_prior
        decoder = self.dummy_decoder
        text_proj = self.dummy_text_proj
        text_encoder = self.dummy_text_encoder
        tokenizer = self.dummy_tokenizer
        super_res_first = self.dummy_super_res_first
        super_res_last = self.dummy_super_res_last

        prior_scheduler = UnCLIPScheduler(
            variance_type="fixed_small_log",
            prediction_type="sample",
            num_train_timesteps=1000,
            clip_sample_range=5.0,
        )

        decoder_scheduler = UnCLIPScheduler(
            variance_type="learned_range",
            prediction_type="epsilon",
            num_train_timesteps=1000,
        )

        super_res_scheduler = UnCLIPScheduler(
            variance_type="fixed_small_log",
            prediction_type="epsilon",
            num_train_timesteps=1000,
        )

        components = {
            "prior": prior,
            "decoder": decoder,
            "text_proj": text_proj,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "super_res_first": super_res_first,
            "super_res_last": super_res_last,
            "prior_scheduler": prior_scheduler,
            "decoder_scheduler": decoder_scheduler,
            "super_res_scheduler": super_res_scheduler,
        }

        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "horse",
            "generator": generator,
            "prior_num_inference_steps": 2,
            "decoder_num_inference_steps": 2,
            "super_res_num_inference_steps": 2,
            "output_type": "np",
        }
        return inputs

    def test_unclip(self):
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        output = pipe(**self.get_dummy_inputs(device))
        image = output.images

        image_from_tuple = pipe(
            **self.get_dummy_inputs(device),
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array(
            [
                0.9997,
                0.9988,
                0.0028,
                0.9997,
                0.9984,
                0.9965,
                0.0029,
                0.9986,
                0.0025,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_unclip_passed_text_embed(self):
        device = torch.device("cpu")

        class DummyScheduler:
            init_noise_sigma = 1

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        prior = components["prior"]
        decoder = components["decoder"]
        super_res_first = components["super_res_first"]
        tokenizer = components["tokenizer"]
        text_encoder = components["text_encoder"]

        generator = torch.Generator(device=device).manual_seed(0)
        dtype = prior.dtype
        batch_size = 1

        shape = (batch_size, prior.config.embedding_dim)
        prior_latents = pipe.prepare_latents(
            shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
        )
        shape = (batch_size, decoder.config.in_channels, decoder.config.sample_size, decoder.config.sample_size)
        generator = torch.Generator(device=device).manual_seed(0)
        decoder_latents = pipe.prepare_latents(
            shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
        )

        shape = (
            batch_size,
            super_res_first.config.in_channels // 2,
            super_res_first.config.sample_size,
            super_res_first.config.sample_size,
        )
        super_res_latents = pipe.prepare_latents(
            shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
        )

        pipe.set_progress_bar_config(disable=None)

        prompt = "this is a prompt example"

        generator = torch.Generator(device=device).manual_seed(0)
        output = pipe(
            [prompt],
            generator=generator,
            prior_num_inference_steps=2,
            decoder_num_inference_steps=2,
            super_res_num_inference_steps=2,
            prior_latents=prior_latents,
            decoder_latents=decoder_latents,
            super_res_latents=super_res_latents,
            output_type="np",
        )
        image = output.images

        text_inputs = tokenizer(
            prompt,
            padding="max_length",
            max_length=tokenizer.model_max_length,
            return_tensors="pt",
        )
        text_model_output = text_encoder(text_inputs.input_ids)
        text_attention_mask = text_inputs.attention_mask

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_text = pipe(
            generator=generator,
            prior_num_inference_steps=2,
            decoder_num_inference_steps=2,
            super_res_num_inference_steps=2,
            prior_latents=prior_latents,
            decoder_latents=decoder_latents,
            super_res_latents=super_res_latents,
            text_model_output=text_model_output,
            text_attention_mask=text_attention_mask,
            output_type="np",
        )[0]

        # make sure passing text embeddings manually is identical
        assert np.abs(image - image_from_text).max() < 1e-4

    # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass
    # because UnCLIP GPU non-determinism requires a looser check.
    @skip_mps
    def test_attention_slicing_forward_pass(self):
        test_max_difference = torch_device == "cpu"

        self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference, expected_max_diff=0.01)

    # Overriding PipelineTesterMixin::test_inference_batch_single_identical
    # because UnCLIP non-determinism requires a looser check.
    @skip_mps
    def test_inference_batch_single_identical(self):
        additional_params_copy_to_batched_inputs = [
            "prior_num_inference_steps",
            "decoder_num_inference_steps",
            "super_res_num_inference_steps",
        ]

        self._test_inference_batch_single_identical(
            additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, expected_max_diff=9.8e-3
        )

    def test_inference_batch_consistent(self):
        additional_params_copy_to_batched_inputs = [
            "prior_num_inference_steps",
            "decoder_num_inference_steps",
            "super_res_num_inference_steps",
        ]

        if torch_device == "mps":
            # TODO: MPS errors with larger batch sizes
            batch_sizes = [2, 3]
            self._test_inference_batch_consistent(
                batch_sizes=batch_sizes,
                additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs,
            )
        else:
            self._test_inference_batch_consistent(
                additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs
            )

    @skip_mps
    def test_dict_tuple_outputs_equivalent(self):
        return super().test_dict_tuple_outputs_equivalent()

    @skip_mps
    def test_save_load_local(self):
        return super().test_save_load_local(expected_max_difference=5e-3)

    @skip_mps
    def test_save_load_optional_components(self):
        return super().test_save_load_optional_components()

    @unittest.skip("UnCLIP produces very large differences in fp16 vs fp32. Test is not useful.")
    def test_float16_inference(self):
        super().test_float16_inference(expected_max_diff=1.0)


@nightly
class UnCLIPPipelineCPUIntegrationTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_unclip_karlo_cpu_fp32(self):
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/unclip/karlo_v1_alpha_horse_cpu.npy"
        )

        pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha")
        pipeline.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        output = pipeline(
            "horse",
            num_images_per_prompt=1,
            generator=generator,
            output_type="np",
        )

        image = output.images[0]

        assert image.shape == (256, 256, 3)
        assert np.abs(expected_image - image).max() < 1e-1


@nightly
@require_torch_accelerator
class UnCLIPPipelineIntegrationTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_unclip_karlo(self):
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/unclip/karlo_v1_alpha_horse_fp16.npy"
        )

        pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16)
        pipeline = pipeline.to(torch_device)
        pipeline.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        output = pipeline(
            "horse",
            generator=generator,
            output_type="np",
        )

        image = output.images[0]

        assert image.shape == (256, 256, 3)

        assert_mean_pixel_difference(image, expected_image)

    def test_unclip_pipeline_with_sequential_cpu_offloading(self):
        backend_empty_cache(torch_device)
        backend_reset_max_memory_allocated(torch_device)
        backend_reset_peak_memory_stats(torch_device)

        pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()
        pipe.enable_sequential_cpu_offload()

        _ = pipe(
            "horse",
            num_images_per_prompt=1,
            prior_num_inference_steps=2,
            decoder_num_inference_steps=2,
            super_res_num_inference_steps=2,
            output_type="np",
        )

        mem_bytes = backend_max_memory_allocated(torch_device)
        # make sure that less than 7 GB is allocated
        assert mem_bytes < 7 * 10**9
@@ -1,540 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import unittest

import numpy as np
import torch
from transformers import (
    CLIPImageProcessor,
    CLIPTextConfig,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
    CLIPVisionConfig,
    CLIPVisionModelWithProjection,
)

from diffusers import (
    DiffusionPipeline,
    UnCLIPImageVariationPipeline,
    UnCLIPScheduler,
    UNet2DConditionModel,
    UNet2DModel,
)
from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    floats_tensor,
    load_image,
    load_numpy,
    nightly,
    require_torch_accelerator,
    skip_mps,
    torch_device,
)

from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference


enable_full_determinism()


class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = UnCLIPImageVariationPipeline
    params = IMAGE_VARIATION_PARAMS - {"height", "width", "guidance_scale"}
    batch_params = IMAGE_VARIATION_BATCH_PARAMS

    required_optional_params = [
        "generator",
        "return_dict",
        "decoder_num_inference_steps",
        "super_res_num_inference_steps",
    ]
    test_xformers_attention = False
    supports_dduf = False

    @property
    def text_embedder_hidden_size(self):
        return 32

    @property
    def time_input_dim(self):
        return 32

    @property
    def block_out_channels_0(self):
        return self.time_input_dim

    @property
    def time_embed_dim(self):
        return self.time_input_dim * 4

    @property
    def cross_attention_dim(self):
        return 100

    @property
    def dummy_tokenizer(self):
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        return tokenizer

    @property
    def dummy_text_encoder(self):
        torch.manual_seed(0)
        config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=self.text_embedder_hidden_size,
            projection_dim=self.text_embedder_hidden_size,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        return CLIPTextModelWithProjection(config)

    @property
    def dummy_image_encoder(self):
        torch.manual_seed(0)
        config = CLIPVisionConfig(
            hidden_size=self.text_embedder_hidden_size,
            projection_dim=self.text_embedder_hidden_size,
            num_hidden_layers=5,
            num_attention_heads=4,
            image_size=32,
            intermediate_size=37,
            patch_size=1,
        )
        return CLIPVisionModelWithProjection(config)

    @property
    def dummy_text_proj(self):
        torch.manual_seed(0)

        model_kwargs = {
            "clip_embeddings_dim": self.text_embedder_hidden_size,
            "time_embed_dim": self.time_embed_dim,
            "cross_attention_dim": self.cross_attention_dim,
        }

        model = UnCLIPTextProjModel(**model_kwargs)
        return model

    @property
    def dummy_decoder(self):
        torch.manual_seed(0)

        model_kwargs = {
            "sample_size": 32,
            # RGB in channels
            "in_channels": 3,
            # Out channels is double in channels because predicts mean and variance
            "out_channels": 6,
            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
            "layers_per_block": 1,
            "cross_attention_dim": self.cross_attention_dim,
            "attention_head_dim": 4,
            "resnet_time_scale_shift": "scale_shift",
            "class_embed_type": "identity",
        }

        model = UNet2DConditionModel(**model_kwargs)
        return model

    @property
    def dummy_super_res_kwargs(self):
        return {
            "sample_size": 64,
            "layers_per_block": 1,
            "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"),
            "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"),
            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
            "in_channels": 6,
            "out_channels": 3,
        }

    @property
    def dummy_super_res_first(self):
        torch.manual_seed(0)

        model = UNet2DModel(**self.dummy_super_res_kwargs)
        return model

    @property
    def dummy_super_res_last(self):
        # seeded differently to get different unet than `self.dummy_super_res_first`
        torch.manual_seed(1)

        model = UNet2DModel(**self.dummy_super_res_kwargs)
        return model

    def get_dummy_components(self):
        decoder = self.dummy_decoder
        text_proj = self.dummy_text_proj
        text_encoder = self.dummy_text_encoder
        tokenizer = self.dummy_tokenizer
        super_res_first = self.dummy_super_res_first
        super_res_last = self.dummy_super_res_last

        decoder_scheduler = UnCLIPScheduler(
            variance_type="learned_range",
            prediction_type="epsilon",
            num_train_timesteps=1000,
        )

        super_res_scheduler = UnCLIPScheduler(
            variance_type="fixed_small_log",
            prediction_type="epsilon",
            num_train_timesteps=1000,
        )

        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)

        image_encoder = self.dummy_image_encoder

        return {
            "decoder": decoder,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "text_proj": text_proj,
            "feature_extractor": feature_extractor,
            "image_encoder": image_encoder,
            "super_res_first": super_res_first,
            "super_res_last": super_res_last,
            "decoder_scheduler": decoder_scheduler,
            "super_res_scheduler": super_res_scheduler,
        }

    def get_dummy_inputs(self, device, seed=0, pil_image=True):
        input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        if pil_image:
            input_image = input_image * 0.5 + 0.5
            input_image = input_image.clamp(0, 1)
            input_image = input_image.cpu().permute(0, 2, 3, 1).float().numpy()
            input_image = DiffusionPipeline.numpy_to_pil(input_image)[0]

        return {
            "image": input_image,
            "generator": generator,
            "decoder_num_inference_steps": 2,
            "super_res_num_inference_steps": 2,
            "output_type": "np",
        }

    def test_unclip_image_variation_input_tensor(self):
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        pipeline_inputs = self.get_dummy_inputs(device, pil_image=False)

        output = pipe(**pipeline_inputs)
        image = output.images

        tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=False)

        image_from_tuple = pipe(
            **tuple_pipeline_inputs,
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array(
            [
                0.9997,
                0.0002,
                0.9997,
                0.9997,
                0.9969,
                0.0023,
                0.9997,
                0.9969,
                0.9970,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_unclip_image_variation_input_image(self):
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        pipeline_inputs = self.get_dummy_inputs(device, pil_image=True)

        output = pipe(**pipeline_inputs)
        image = output.images

        tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=True)

        image_from_tuple = pipe(
            **tuple_pipeline_inputs,
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.9997, 0.0003, 0.9997, 0.9997, 0.9970, 0.0024, 0.9997, 0.9971, 0.9971])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_unclip_image_variation_input_list_images(self):
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        pipeline_inputs = self.get_dummy_inputs(device, pil_image=True)
        pipeline_inputs["image"] = [
            pipeline_inputs["image"],
            pipeline_inputs["image"],
        ]

        output = pipe(**pipeline_inputs)
        image = output.images

        tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=True)
        tuple_pipeline_inputs["image"] = [
            tuple_pipeline_inputs["image"],
            tuple_pipeline_inputs["image"],
        ]

        image_from_tuple = pipe(
            **tuple_pipeline_inputs,
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (2, 64, 64, 3)

        expected_slice = np.array(
            [
                0.9997,
                0.9989,
                0.0008,
                0.0021,
                0.9960,
                0.0018,
                0.0014,
                0.0002,
                0.9933,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_unclip_passed_image_embed(self):
        device = torch.device("cpu")

        class DummyScheduler:
            init_noise_sigma = 1

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device=device).manual_seed(0)
        dtype = pipe.decoder.dtype
        batch_size = 1

        shape = (
            batch_size,
            pipe.decoder.config.in_channels,
            pipe.decoder.config.sample_size,
            pipe.decoder.config.sample_size,
        )
        decoder_latents = pipe.prepare_latents(
            shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
        )

        shape = (
            batch_size,
            pipe.super_res_first.config.in_channels // 2,
            pipe.super_res_first.config.sample_size,
            pipe.super_res_first.config.sample_size,
        )
        generator = torch.Generator(device=device).manual_seed(0)
        super_res_latents = pipe.prepare_latents(
            shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler()
        )

        pipeline_inputs = self.get_dummy_inputs(device, pil_image=False)

        img_out_1 = pipe(
            **pipeline_inputs, decoder_latents=decoder_latents, super_res_latents=super_res_latents
        ).images

        pipeline_inputs = self.get_dummy_inputs(device, pil_image=False)
        # Don't pass image, instead pass embedding
        image = pipeline_inputs.pop("image")
        image_embeddings = pipe.image_encoder(image).image_embeds

        img_out_2 = pipe(
            **pipeline_inputs,
            decoder_latents=decoder_latents,
            super_res_latents=super_res_latents,
            image_embeddings=image_embeddings,
        ).images

        # make sure passing image embeddings manually is identical
        assert np.abs(img_out_1 - img_out_2).max() < 1e-4

    # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass
    # because UnCLIP GPU non-determinism requires a looser check.
    @skip_mps
    def test_attention_slicing_forward_pass(self):
        test_max_difference = torch_device == "cpu"

        # Check is relaxed because there is not a torch 2.0 sliced attention added kv processor
        expected_max_diff = 1e-2

        self._test_attention_slicing_forward_pass(
            test_max_difference=test_max_difference, expected_max_diff=expected_max_diff
        )

    # Overriding PipelineTesterMixin::test_inference_batch_single_identical
    # because UnCLIP non-determinism requires a looser check.
    @unittest.skip("UnCLIP produces very large differences. Test is not useful.")
    @skip_mps
    def test_inference_batch_single_identical(self):
        additional_params_copy_to_batched_inputs = [
            "decoder_num_inference_steps",
            "super_res_num_inference_steps",
        ]
        self._test_inference_batch_single_identical(
            additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, expected_max_diff=5e-3
        )

    def test_inference_batch_consistent(self):
        additional_params_copy_to_batched_inputs = [
            "decoder_num_inference_steps",
            "super_res_num_inference_steps",
        ]

        if torch_device == "mps":
            # TODO: MPS errors with larger batch sizes
            batch_sizes = [2, 3]
            self._test_inference_batch_consistent(
                batch_sizes=batch_sizes,
                additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs,
            )
        else:
            self._test_inference_batch_consistent(
                additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs
            )

    @skip_mps
    def test_dict_tuple_outputs_equivalent(self):
        return super().test_dict_tuple_outputs_equivalent()

    @unittest.skip("UnCLIP produces very large difference. Test is not useful.")
    @skip_mps
    def test_save_load_local(self):
        return super().test_save_load_local(expected_max_difference=4e-3)

    @skip_mps
    def test_save_load_optional_components(self):
        return super().test_save_load_optional_components()

    @unittest.skip("UnCLIP produces very large difference in fp16 vs fp32. Test is not useful.")
    def test_float16_inference(self):
        super().test_float16_inference(expected_max_diff=1.0)


@nightly
@require_torch_accelerator
class UnCLIPImageVariationPipelineIntegrationTests(unittest.TestCase):
    def setUp(self):
        # clean up the VRAM before each test
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_unclip_image_variation_karlo(self):
        input_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unclip/cat.png"
        )
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/unclip/karlo_v1_alpha_cat_variation_fp16.npy"
        )

        pipeline = UnCLIPImageVariationPipeline.from_pretrained(
            "kakaobrain/karlo-v1-alpha-image-variations", torch_dtype=torch.float16
        )
        pipeline = pipeline.to(torch_device)
        pipeline.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        output = pipeline(
            input_image,
            generator=generator,
            output_type="np",
        )

        image = output.images[0]

        assert image.shape == (256, 256, 3)

        assert_mean_pixel_difference(image, expected_image, 15)
@@ -1,764 +0,0 @@
import gc
import random
import unittest

import numpy as np
import torch
from PIL import Image
from transformers import (
    CLIPImageProcessor,
    CLIPTextModel,
    CLIPTokenizer,
    CLIPVisionModelWithProjection,
    GPT2Tokenizer,
)

from diffusers import (
    AutoencoderKL,
    DPMSolverMultistepScheduler,
    UniDiffuserModel,
    UniDiffuserPipeline,
    UniDiffuserTextDecoder,
)
from diffusers.utils.testing_utils import (
    backend_empty_cache,
    enable_full_determinism,
    floats_tensor,
    load_image,
    nightly,
    require_torch_accelerator,
    torch_device,
)
from diffusers.utils.torch_utils import randn_tensor

from ..pipeline_params import (
    IMAGE_TO_IMAGE_IMAGE_PARAMS,
    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
)
from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin


enable_full_determinism()


class UniDiffuserPipelineFastTests(
    PipelineTesterMixin, PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
):
    pipeline_class = UniDiffuserPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
    # vae_latents, not latents, is the argument that corresponds to VAE latent inputs
    image_latents_params = frozenset(["vae_latents"])

    supports_dduf = False

    def get_dummy_components(self):
        unet = UniDiffuserModel.from_pretrained(
            "hf-internal-testing/unidiffuser-diffusers-test",
            subfolder="unet",
        )

        scheduler = DPMSolverMultistepScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            solver_order=3,
        )

        vae = AutoencoderKL.from_pretrained(
            "hf-internal-testing/unidiffuser-diffusers-test",
            subfolder="vae",
        )

        text_encoder = CLIPTextModel.from_pretrained(
            "hf-internal-testing/unidiffuser-diffusers-test",
            subfolder="text_encoder",
        )
        clip_tokenizer = CLIPTokenizer.from_pretrained(
            "hf-internal-testing/unidiffuser-diffusers-test",
            subfolder="clip_tokenizer",
        )

        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
            "hf-internal-testing/unidiffuser-diffusers-test",
            subfolder="image_encoder",
        )
        # From the Stable Diffusion Image Variation pipeline tests
        clip_image_processor = CLIPImageProcessor(crop_size=32, size=32)

        text_tokenizer = GPT2Tokenizer.from_pretrained(
            "hf-internal-testing/unidiffuser-diffusers-test",
            subfolder="text_tokenizer",
        )
        text_decoder = UniDiffuserTextDecoder.from_pretrained(
            "hf-internal-testing/unidiffuser-diffusers-test",
            subfolder="text_decoder",
        )

        components = {
            "vae": vae,
            "text_encoder": text_encoder,
            "image_encoder": image_encoder,
            "clip_image_processor": clip_image_processor,
            "clip_tokenizer": clip_tokenizer,
            "text_decoder": text_decoder,
            "text_tokenizer": text_tokenizer,
            "unet": unet,
            "scheduler": scheduler,
        }

        return components
    def get_dummy_inputs(self, device, seed=0):
        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
        image = image.cpu().permute(0, 2, 3, 1)[0]
        image = Image.fromarray(np.uint8(image)).convert("RGB")
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "an elephant under the sea",
            "image": image,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "np",
        }
        return inputs

    def get_fixed_latents(self, device, seed=0):
        if isinstance(device, str):
            device = torch.device(device)
        generator = torch.Generator(device=device).manual_seed(seed)
        # Hardcode the shapes for now.
        prompt_latents = randn_tensor((1, 77, 32), generator=generator, device=device, dtype=torch.float32)
        vae_latents = randn_tensor((1, 4, 16, 16), generator=generator, device=device, dtype=torch.float32)
        clip_latents = randn_tensor((1, 1, 32), generator=generator, device=device, dtype=torch.float32)

        latents = {
            "prompt_latents": prompt_latents,
            "vae_latents": vae_latents,
            "clip_latents": clip_latents,
        }
        return latents

    def get_dummy_inputs_with_latents(self, device, seed=0):
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg",
        )
        image = image.resize((32, 32))
        latents = self.get_fixed_latents(device, seed=seed)

        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        inputs = {
            "prompt": "an elephant under the sea",
            "image": image,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "np",
            "prompt_latents": latents.get("prompt_latents"),
            "vae_latents": latents.get("vae_latents"),
            "clip_latents": latents.get("clip_latents"),
        }
        return inputs

def test_dict_tuple_outputs_equivalent(self):
|
||||
expected_slice = None
|
||||
if torch_device == "cpu":
|
||||
expected_slice = np.array([0.7489, 0.3722, 0.4475, 0.5630, 0.5923, 0.4992, 0.3936, 0.5844, 0.4975])
|
||||
super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
|
||||
|
||||
def test_unidiffuser_default_joint_v0(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
components = self.get_dummy_components()
|
||||
unidiffuser_pipe = UniDiffuserPipeline(**components)
|
||||
unidiffuser_pipe = unidiffuser_pipe.to(device)
|
||||
unidiffuser_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# Set mode to 'joint'
|
||||
unidiffuser_pipe.set_joint_mode()
|
||||
assert unidiffuser_pipe.mode == "joint"
|
||||
|
||||
# inputs = self.get_dummy_inputs(device)
|
||||
inputs = self.get_dummy_inputs_with_latents(device)
|
||||
# Delete prompt and image for joint inference.
|
||||
del inputs["prompt"]
|
||||
del inputs["image"]
|
||||
sample = unidiffuser_pipe(**inputs)
|
||||
image = sample.images
|
||||
text = sample.text
|
||||
assert image.shape == (1, 32, 32, 3)
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716])
|
||||
assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
|
||||
|
||||
expected_text_prefix = " no no no "
|
||||
assert text[0][:10] == expected_text_prefix
|
||||
|
||||
def test_unidiffuser_default_joint_no_cfg_v0(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
components = self.get_dummy_components()
|
||||
unidiffuser_pipe = UniDiffuserPipeline(**components)
|
||||
unidiffuser_pipe = unidiffuser_pipe.to(device)
|
||||
unidiffuser_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# Set mode to 'joint'
|
||||
unidiffuser_pipe.set_joint_mode()
|
||||
assert unidiffuser_pipe.mode == "joint"
|
||||
|
||||
# inputs = self.get_dummy_inputs(device)
|
||||
inputs = self.get_dummy_inputs_with_latents(device)
|
||||
# Delete prompt and image for joint inference.
|
||||
del inputs["prompt"]
|
||||
del inputs["image"]
|
||||
# Set guidance scale to 1.0 to turn off CFG
|
||||
inputs["guidance_scale"] = 1.0
|
||||
sample = unidiffuser_pipe(**inputs)
|
||||
image = sample.images
|
||||
text = sample.text
|
||||
assert image.shape == (1, 32, 32, 3)
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716])
|
||||
assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
|
||||
|
||||
expected_text_prefix = " no no no "
|
||||
assert text[0][:10] == expected_text_prefix
|
||||
|
||||
def test_unidiffuser_default_text2img_v0(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
components = self.get_dummy_components()
|
||||
unidiffuser_pipe = UniDiffuserPipeline(**components)
|
||||
unidiffuser_pipe = unidiffuser_pipe.to(device)
|
||||
unidiffuser_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# Set mode to 'text2img'
|
||||
unidiffuser_pipe.set_text_to_image_mode()
|
||||
assert unidiffuser_pipe.mode == "text2img"
|
||||
|
||||
inputs = self.get_dummy_inputs_with_latents(device)
|
||||
# Delete image for text-conditioned image generation
|
||||
del inputs["image"]
|
||||
image = unidiffuser_pipe(**inputs).images
|
||||
assert image.shape == (1, 32, 32, 3)
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
expected_slice = np.array([0.5758, 0.6269, 0.6570, 0.4967, 0.4639, 0.5664, 0.5257, 0.5067, 0.5715])
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
|
||||
|
||||
def test_unidiffuser_default_image_0(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
components = self.get_dummy_components()
|
||||
unidiffuser_pipe = UniDiffuserPipeline(**components)
|
||||
unidiffuser_pipe = unidiffuser_pipe.to(device)
|
||||
unidiffuser_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# Set mode to 'img'
|
||||
unidiffuser_pipe.set_image_mode()
|
||||
assert unidiffuser_pipe.mode == "img"
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
# Delete prompt and image for unconditional ("marginal") text generation.
|
||||
del inputs["prompt"]
|
||||
del inputs["image"]
|
||||
image = unidiffuser_pipe(**inputs).images
|
||||
assert image.shape == (1, 32, 32, 3)
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
expected_slice = np.array([0.5760, 0.6270, 0.6571, 0.4966, 0.4638, 0.5663, 0.5254, 0.5068, 0.5715])
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
|
||||
|
||||
def test_unidiffuser_default_text_v0(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
components = self.get_dummy_components()
|
||||
unidiffuser_pipe = UniDiffuserPipeline(**components)
|
||||
unidiffuser_pipe = unidiffuser_pipe.to(device)
|
||||
unidiffuser_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# Set mode to 'img'
|
||||
unidiffuser_pipe.set_text_mode()
|
||||
assert unidiffuser_pipe.mode == "text"
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
# Delete prompt and image for unconditional ("marginal") text generation.
|
||||
del inputs["prompt"]
|
||||
del inputs["image"]
|
||||
text = unidiffuser_pipe(**inputs).text
|
||||
|
||||
expected_text_prefix = " no no no "
|
||||
assert text[0][:10] == expected_text_prefix
|
||||
|
||||
def test_unidiffuser_default_img2text_v0(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
components = self.get_dummy_components()
|
||||
unidiffuser_pipe = UniDiffuserPipeline(**components)
|
||||
unidiffuser_pipe = unidiffuser_pipe.to(device)
|
||||
unidiffuser_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# Set mode to 'img2text'
|
||||
unidiffuser_pipe.set_image_to_text_mode()
|
||||
assert unidiffuser_pipe.mode == "img2text"
|
||||
|
||||
inputs = self.get_dummy_inputs_with_latents(device)
|
||||
# Delete text for image-conditioned text generation
|
||||
del inputs["prompt"]
|
||||
text = unidiffuser_pipe(**inputs).text
|
||||
|
||||
expected_text_prefix = " no no no "
|
||||
assert text[0][:10] == expected_text_prefix
|
||||
|
||||
def test_unidiffuser_default_joint_v1(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1")
|
||||
unidiffuser_pipe = unidiffuser_pipe.to(device)
|
||||
unidiffuser_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# Set mode to 'joint'
|
||||
unidiffuser_pipe.set_joint_mode()
|
||||
assert unidiffuser_pipe.mode == "joint"
|
||||
|
||||
# inputs = self.get_dummy_inputs(device)
|
||||
inputs = self.get_dummy_inputs_with_latents(device)
|
||||
# Delete prompt and image for joint inference.
|
||||
del inputs["prompt"]
|
||||
del inputs["image"]
|
||||
inputs["data_type"] = 1
|
||||
sample = unidiffuser_pipe(**inputs)
|
||||
image = sample.images
|
||||
text = sample.text
|
||||
assert image.shape == (1, 32, 32, 3)
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716])
|
||||
assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3
|
||||
|
||||
expected_text_prefix = " no no no "
|
||||
assert text[0][:10] == expected_text_prefix
|
||||
|
||||
def test_unidiffuser_default_text2img_v1(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1")
|
||||
unidiffuser_pipe = unidiffuser_pipe.to(device)
|
||||
unidiffuser_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# Set mode to 'text2img'
|
||||
unidiffuser_pipe.set_text_to_image_mode()
|
||||
assert unidiffuser_pipe.mode == "text2img"
|
||||
|
||||
inputs = self.get_dummy_inputs_with_latents(device)
|
||||
# Delete image for text-conditioned image generation
|
||||
del inputs["image"]
|
||||
image = unidiffuser_pipe(**inputs).images
|
||||
assert image.shape == (1, 32, 32, 3)
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
expected_slice = np.array([0.5758, 0.6269, 0.6570, 0.4967, 0.4639, 0.5664, 0.5257, 0.5067, 0.5715])
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
|
||||
|
||||
def test_unidiffuser_default_img2text_v1(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1")
|
||||
unidiffuser_pipe = unidiffuser_pipe.to(device)
|
||||
unidiffuser_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# Set mode to 'img2text'
|
||||
unidiffuser_pipe.set_image_to_text_mode()
|
||||
assert unidiffuser_pipe.mode == "img2text"
|
||||
|
||||
inputs = self.get_dummy_inputs_with_latents(device)
|
||||
# Delete text for image-conditioned text generation
|
||||
del inputs["prompt"]
|
||||
text = unidiffuser_pipe(**inputs).text
|
||||
|
||||
expected_text_prefix = " no no no "
|
||||
assert text[0][:10] == expected_text_prefix
|
||||
|
||||
def test_unidiffuser_text2img_multiple_images(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
components = self.get_dummy_components()
|
||||
unidiffuser_pipe = UniDiffuserPipeline(**components)
|
||||
unidiffuser_pipe = unidiffuser_pipe.to(device)
|
||||
unidiffuser_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# Set mode to 'text2img'
|
||||
unidiffuser_pipe.set_text_to_image_mode()
|
||||
assert unidiffuser_pipe.mode == "text2img"
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
# Delete image for text-conditioned image generation
|
||||
del inputs["image"]
|
||||
inputs["num_images_per_prompt"] = 2
|
||||
inputs["num_prompts_per_image"] = 3
|
||||
image = unidiffuser_pipe(**inputs).images
|
||||
assert image.shape == (2, 32, 32, 3)
|
||||
|
||||
def test_unidiffuser_img2text_multiple_prompts(self):
|
||||
device = "cpu" # ensure determinism for the device-dependent torch.Generator
|
||||
components = self.get_dummy_components()
|
||||
unidiffuser_pipe = UniDiffuserPipeline(**components)
|
||||
unidiffuser_pipe = unidiffuser_pipe.to(device)
|
||||
unidiffuser_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# Set mode to 'img2text'
|
||||
unidiffuser_pipe.set_image_to_text_mode()
|
||||
assert unidiffuser_pipe.mode == "img2text"
|
||||
|
||||
inputs = self.get_dummy_inputs(device)
|
||||
# Delete text for image-conditioned text generation
|
||||
del inputs["prompt"]
|
||||
inputs["num_images_per_prompt"] = 2
|
||||
inputs["num_prompts_per_image"] = 3
        text = unidiffuser_pipe(**inputs).text

        assert len(text) == 3

    def test_unidiffuser_text2img_multiple_images_with_latents(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        unidiffuser_pipe = UniDiffuserPipeline(**components)
        unidiffuser_pipe = unidiffuser_pipe.to(device)
        unidiffuser_pipe.set_progress_bar_config(disable=None)

        # Set mode to 'text2img'
        unidiffuser_pipe.set_text_to_image_mode()
        assert unidiffuser_pipe.mode == "text2img"

        inputs = self.get_dummy_inputs_with_latents(device)
        # Delete image for text-conditioned image generation
        del inputs["image"]
        inputs["num_images_per_prompt"] = 2
        inputs["num_prompts_per_image"] = 3
        image = unidiffuser_pipe(**inputs).images
        assert image.shape == (2, 32, 32, 3)

    def test_unidiffuser_img2text_multiple_prompts_with_latents(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        unidiffuser_pipe = UniDiffuserPipeline(**components)
        unidiffuser_pipe = unidiffuser_pipe.to(device)
        unidiffuser_pipe.set_progress_bar_config(disable=None)

        # Set mode to 'img2text'
        unidiffuser_pipe.set_image_to_text_mode()
        assert unidiffuser_pipe.mode == "img2text"

        inputs = self.get_dummy_inputs_with_latents(device)
        # Delete text for image-conditioned text generation
        del inputs["prompt"]
        inputs["num_images_per_prompt"] = 2
        inputs["num_prompts_per_image"] = 3
        text = unidiffuser_pipe(**inputs).text

        assert len(text) == 3

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=2e-4)

    @require_torch_accelerator
    def test_unidiffuser_default_joint_v1_fp16(self):
        unidiffuser_pipe = UniDiffuserPipeline.from_pretrained(
            "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16
        )
        unidiffuser_pipe = unidiffuser_pipe.to(torch_device)
        unidiffuser_pipe.set_progress_bar_config(disable=None)

        # Set mode to 'joint'
        unidiffuser_pipe.set_joint_mode()
        assert unidiffuser_pipe.mode == "joint"

        inputs = self.get_dummy_inputs_with_latents(torch_device)
        # Delete prompt and image for joint inference.
        del inputs["prompt"]
        del inputs["image"]
        inputs["data_type"] = 1
        sample = unidiffuser_pipe(**inputs)
        image = sample.images
        text = sample.text
        assert image.shape == (1, 32, 32, 3)

        image_slice = image[0, -3:, -3:, -1]
        expected_img_slice = np.array([0.5049, 0.5498, 0.5854, 0.3052, 0.4460, 0.6489, 0.5122, 0.4810, 0.6138])
        assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3

        expected_text_prefix = '" This This'
        assert text[0][: len(expected_text_prefix)] == expected_text_prefix

    @require_torch_accelerator
    def test_unidiffuser_default_text2img_v1_fp16(self):
        unidiffuser_pipe = UniDiffuserPipeline.from_pretrained(
            "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16
        )
        unidiffuser_pipe = unidiffuser_pipe.to(torch_device)
        unidiffuser_pipe.set_progress_bar_config(disable=None)

        # Set mode to 'text2img'
        unidiffuser_pipe.set_text_to_image_mode()
        assert unidiffuser_pipe.mode == "text2img"

        inputs = self.get_dummy_inputs_with_latents(torch_device)
        # Delete image for text-conditioned image generation
        del inputs["image"]
        inputs["data_type"] = 1
        sample = unidiffuser_pipe(**inputs)
        image = sample.images
        assert image.shape == (1, 32, 32, 3)

        image_slice = image[0, -3:, -3:, -1]
        expected_img_slice = np.array([0.5054, 0.5498, 0.5854, 0.3052, 0.4458, 0.6489, 0.5122, 0.4810, 0.6138])
        assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3

    @require_torch_accelerator
    def test_unidiffuser_default_img2text_v1_fp16(self):
        unidiffuser_pipe = UniDiffuserPipeline.from_pretrained(
            "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16
        )
        unidiffuser_pipe = unidiffuser_pipe.to(torch_device)
        unidiffuser_pipe.set_progress_bar_config(disable=None)

        # Set mode to 'img2text'
        unidiffuser_pipe.set_image_to_text_mode()
        assert unidiffuser_pipe.mode == "img2text"

        inputs = self.get_dummy_inputs_with_latents(torch_device)
        # Delete text for image-conditioned text generation
        del inputs["prompt"]
        inputs["data_type"] = 1
        text = unidiffuser_pipe(**inputs).text

        expected_text_prefix = '" This This'
        assert text[0][: len(expected_text_prefix)] == expected_text_prefix

    @unittest.skip(
        "Not supported: the pipeline takes a number of config values directly at init, and it is no longer widely used."
    )
    def test_encode_prompt_works_in_isolation(self):
        pass


@nightly
@require_torch_accelerator
class UniDiffuserPipelineSlowTests(unittest.TestCase):
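    # End-to-end tests against the real thu-ml/unidiffuser-v1 checkpoint; they
    # are gated behind @nightly and @require_torch_accelerator, so they do not
    # run in the fast CI suite.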
    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def get_inputs(self, device, seed=0, generate_latents=False):
        generator = torch.manual_seed(seed)
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg"
        )
        inputs = {
            "prompt": "an elephant under the sea",
            "image": image,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 8.0,
            "output_type": "np",
        }
        if generate_latents:
            latents = self.get_fixed_latents(device, seed=seed)
            for latent_name, latent_tensor in latents.items():
                inputs[latent_name] = latent_tensor
        return inputs

    def get_fixed_latents(self, device, seed=0):
        if isinstance(device, str):
            device = torch.device(device)
        latent_device = torch.device("cpu")
        generator = torch.Generator(device=latent_device).manual_seed(seed)
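        # The generator lives on the CPU so that the fixed latents stay
        # reproducible across accelerator types; randn_tensor samples on the
        # generator's device before the tensors are moved to `device` below.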
        # Hardcode the shapes for now.
        prompt_latents = randn_tensor((1, 77, 768), generator=generator, device=device, dtype=torch.float32)
        vae_latents = randn_tensor((1, 4, 64, 64), generator=generator, device=device, dtype=torch.float32)
        clip_latents = randn_tensor((1, 1, 512), generator=generator, device=device, dtype=torch.float32)

        # Move latents onto the desired device.
        prompt_latents = prompt_latents.to(device)
        vae_latents = vae_latents.to(device)
        clip_latents = clip_latents.to(device)

        latents = {
            "prompt_latents": prompt_latents,
            "vae_latents": vae_latents,
            "clip_latents": clip_latents,
        }
        return latents

    def test_unidiffuser_default_joint_v1(self):
        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(device=torch_device, generate_latents=True)
        # Delete prompt and image for joint inference.
        del inputs["prompt"]
        del inputs["image"]
        sample = pipe(**inputs)
        image = sample.images
        text = sample.text
        assert image.shape == (1, 512, 512, 3)

        image_slice = image[0, -3:, -3:, -1]
        expected_img_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520])
        assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-1

        expected_text_prefix = "a living room"
        assert text[0][: len(expected_text_prefix)] == expected_text_prefix

    def test_unidiffuser_default_text2img_v1(self):
        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(device=torch_device, generate_latents=True)
        del inputs["image"]
        sample = pipe(**inputs)
        image = sample.images
        assert image.shape == (1, 512, 512, 3)

        image_slice = image[0, -3:, -3:, -1]
        expected_slice = np.array([0.0242, 0.0103, 0.0022, 0.0129, 0.0000, 0.0090, 0.0376, 0.0508, 0.0005])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1

    def test_unidiffuser_default_img2text_v1(self):
        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(device=torch_device, generate_latents=True)
        del inputs["prompt"]
        sample = pipe(**inputs)
        text = sample.text

        expected_text_prefix = "An astronaut"
        assert text[0][: len(expected_text_prefix)] == expected_text_prefix


@nightly
@require_torch_accelerator
class UniDiffuserPipelineNightlyTests(unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        super().tearDown()
        gc.collect()
        backend_empty_cache(torch_device)

    def get_inputs(self, device, seed=0, generate_latents=False):
        generator = torch.manual_seed(seed)
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg"
        )
        inputs = {
            "prompt": "an elephant under the sea",
            "image": image,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 8.0,
            "output_type": "np",
        }
        if generate_latents:
            latents = self.get_fixed_latents(device, seed=seed)
            for latent_name, latent_tensor in latents.items():
                inputs[latent_name] = latent_tensor
        return inputs

    def get_fixed_latents(self, device, seed=0):
        if isinstance(device, str):
            device = torch.device(device)
        latent_device = torch.device("cpu")
        generator = torch.Generator(device=latent_device).manual_seed(seed)
        # Hardcode the shapes for now.
        prompt_latents = randn_tensor((1, 77, 768), generator=generator, device=device, dtype=torch.float32)
        vae_latents = randn_tensor((1, 4, 64, 64), generator=generator, device=device, dtype=torch.float32)
        clip_latents = randn_tensor((1, 1, 512), generator=generator, device=device, dtype=torch.float32)

        # Move latents onto the desired device.
        prompt_latents = prompt_latents.to(device)
        vae_latents = vae_latents.to(device)
        clip_latents = clip_latents.to(device)

        latents = {
            "prompt_latents": prompt_latents,
            "vae_latents": vae_latents,
            "clip_latents": clip_latents,
        }
        return latents

    def test_unidiffuser_default_joint_v1_fp16(self):
        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(device=torch_device, generate_latents=True)
        # Delete prompt and image for joint inference.
        del inputs["prompt"]
        del inputs["image"]
        sample = pipe(**inputs)
        image = sample.images
        text = sample.text
        assert image.shape == (1, 512, 512, 3)

        image_slice = image[0, -3:, -3:, -1]
        expected_img_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520])
        assert np.abs(image_slice.flatten() - expected_img_slice).max() < 2e-1

        expected_text_prefix = "a living room"
        assert text[0][: len(expected_text_prefix)] == expected_text_prefix

    def test_unidiffuser_default_text2img_v1_fp16(self):
        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(device=torch_device, generate_latents=True)
        del inputs["image"]
        sample = pipe(**inputs)
        image = sample.images
        assert image.shape == (1, 512, 512, 3)

        image_slice = image[0, -3:, -3:, -1]
        expected_slice = np.array([0.0242, 0.0103, 0.0022, 0.0129, 0.0000, 0.0090, 0.0376, 0.0508, 0.0005])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1

    def test_unidiffuser_default_img2text_v1_fp16(self):
        pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(device=torch_device, generate_latents=True)
        del inputs["prompt"]
        sample = pipe(**inputs)
        text = sample.text

        expected_text_prefix = "An astronaut"
        assert text[0][: len(expected_text_prefix)] == expected_text_prefix
@@ -1,241 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import DDPMWuerstchenScheduler, WuerstchenCombinedPipeline
from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt, WuerstchenPrior
from diffusers.utils.testing_utils import enable_full_determinism, require_torch_accelerator, torch_device

from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class WuerstchenCombinedPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = WuerstchenCombinedPipeline
    params = ["prompt"]
    batch_params = ["prompt", "negative_prompt"]
    required_optional_params = [
        "generator",
        "height",
        "width",
        "latents",
        "prior_guidance_scale",
        "decoder_guidance_scale",
        "negative_prompt",
        "num_inference_steps",
        "return_dict",
        "prior_num_inference_steps",
        "output_type",
    ]
    test_xformers_attention = True

    @property
    def text_embedder_hidden_size(self):
        return 32

    @property
    def dummy_prior(self):
        torch.manual_seed(0)

        model_kwargs = {"c_in": 2, "c": 8, "depth": 2, "c_cond": 32, "c_r": 8, "nhead": 2}
        model = WuerstchenPrior(**model_kwargs)
        return model.eval()

    @property
    def dummy_tokenizer(self):
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        return tokenizer

    @property
    def dummy_prior_text_encoder(self):
        torch.manual_seed(0)
        config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=self.text_embedder_hidden_size,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        return CLIPTextModel(config).eval()

    @property
    def dummy_text_encoder(self):
        torch.manual_seed(0)
        config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            projection_dim=self.text_embedder_hidden_size,
            hidden_size=self.text_embedder_hidden_size,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        return CLIPTextModel(config).eval()

    @property
    def dummy_vqgan(self):
        torch.manual_seed(0)

        model_kwargs = {
            "bottleneck_blocks": 1,
            "num_vq_embeddings": 2,
        }
        model = PaellaVQModel(**model_kwargs)
        return model.eval()

    @property
    def dummy_decoder(self):
        torch.manual_seed(0)

        model_kwargs = {
            "c_cond": self.text_embedder_hidden_size,
            "c_hidden": [320],
            "nhead": [-1],
            "blocks": [4],
            "level_config": ["CT"],
            "clip_embd": self.text_embedder_hidden_size,
            "inject_effnet": [False],
        }

        model = WuerstchenDiffNeXt(**model_kwargs)
        return model.eval()

    def get_dummy_components(self):
        prior = self.dummy_prior
        prior_text_encoder = self.dummy_prior_text_encoder

        scheduler = DDPMWuerstchenScheduler()
        tokenizer = self.dummy_tokenizer

        text_encoder = self.dummy_text_encoder
        decoder = self.dummy_decoder
        vqgan = self.dummy_vqgan
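
        # The combined pipeline nests a prior stage and a decoder stage; the
        # prior-stage components are wired in under a `prior_` prefix.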
        components = {
            "tokenizer": tokenizer,
            "text_encoder": text_encoder,
            "decoder": decoder,
            "vqgan": vqgan,
            "scheduler": scheduler,
            "prior_prior": prior,
            "prior_text_encoder": prior_text_encoder,
            "prior_tokenizer": tokenizer,
            "prior_scheduler": scheduler,
        }

        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "horse",
            "generator": generator,
            "prior_guidance_scale": 4.0,
            "decoder_guidance_scale": 4.0,
            "num_inference_steps": 2,
            "prior_num_inference_steps": 2,
            "output_type": "np",
            "height": 128,
            "width": 128,
        }
        return inputs

    def test_wuerstchen(self):
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        output = pipe(**self.get_dummy_inputs(device))
        image = output.images

        image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[-3:, -3:, -1]

        assert image.shape == (1, 128, 128, 3)

        expected_slice = np.array([0.7616304, 0.0, 1.0, 0.0, 1.0, 0.0, 0.05925313, 0.0, 0.951898])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, (
            f" expected_slice {expected_slice}, but got {image_slice.flatten()}"
        )
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2, (
            f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}"
        )

    @require_torch_accelerator
    def test_offloads(self):
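        # Build the same pipeline three ways -- moved to the accelerator
        # directly, with sequential CPU offload, and with model CPU offload --
        # and check that all three produce matching image slices.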
        pipes = []
        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components).to(torch_device)
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_sequential_cpu_offload(device=torch_device)
        pipes.append(sd_pipe)

        components = self.get_dummy_components()
        sd_pipe = self.pipeline_class(**components)
        sd_pipe.enable_model_cpu_offload(device=torch_device)
        pipes.append(sd_pipe)

        image_slices = []
        for pipe in pipes:
            inputs = self.get_dummy_inputs(torch_device)
            image = pipe(**inputs).images

            image_slices.append(image[0, -3:, -3:, -1].flatten())

        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
        assert np.abs(image_slices[0] - image_slices[2]).max() < 1e-3

    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical(expected_max_diff=1e-2)

    @unittest.skip(reason="flaky and float16 requires CUDA")
    def test_float16_inference(self):
        super().test_float16_inference()

    @unittest.skip(reason="Test not supported.")
    def test_callback_inputs(self):
        pass

    @unittest.skip(reason="Test not supported.")
    def test_callback_cfg(self):
        pass
@@ -1,192 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import DDPMWuerstchenScheduler, WuerstchenDecoderPipeline
from diffusers.pipelines.wuerstchen import PaellaVQModel, WuerstchenDiffNeXt
from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device

from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class WuerstchenDecoderPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = WuerstchenDecoderPipeline
    params = ["prompt"]
    batch_params = ["image_embeddings", "prompt", "negative_prompt"]
    required_optional_params = [
        "num_images_per_prompt",
        "num_inference_steps",
        "latents",
        "negative_prompt",
        "guidance_scale",
        "output_type",
        "return_dict",
    ]
    test_xformers_attention = False
    callback_cfg_params = ["image_embeddings", "text_encoder_hidden_states"]

    @property
    def text_embedder_hidden_size(self):
        return 32

    @property
    def time_input_dim(self):
        return 32

    @property
    def block_out_channels_0(self):
        return self.time_input_dim

    @property
    def time_embed_dim(self):
        return self.time_input_dim * 4

    @property
    def dummy_tokenizer(self):
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        return tokenizer

    @property
    def dummy_text_encoder(self):
        torch.manual_seed(0)
        config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            projection_dim=self.text_embedder_hidden_size,
            hidden_size=self.text_embedder_hidden_size,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        return CLIPTextModel(config).eval()

    @property
    def dummy_vqgan(self):
        torch.manual_seed(0)

        model_kwargs = {
            "bottleneck_blocks": 1,
            "num_vq_embeddings": 2,
        }
        model = PaellaVQModel(**model_kwargs)
        return model.eval()

    @property
    def dummy_decoder(self):
        torch.manual_seed(0)

        model_kwargs = {
            "c_cond": self.text_embedder_hidden_size,
            "c_hidden": [320],
            "nhead": [-1],
            "blocks": [4],
            "level_config": ["CT"],
            "clip_embd": self.text_embedder_hidden_size,
            "inject_effnet": [False],
        }

        model = WuerstchenDiffNeXt(**model_kwargs)
        return model.eval()

    def get_dummy_components(self):
        decoder = self.dummy_decoder
        text_encoder = self.dummy_text_encoder
        tokenizer = self.dummy_tokenizer
        vqgan = self.dummy_vqgan

        scheduler = DDPMWuerstchenScheduler()

        components = {
            "decoder": decoder,
            "vqgan": vqgan,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "scheduler": scheduler,
            "latent_dim_scale": 4.0,
        }

        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
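        # The decoder normally consumes image embeddings produced by the prior
        # pipeline; a constant tensor stands in for them in this fast test.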
        inputs = {
            "image_embeddings": torch.ones((1, 4, 4, 4), device=device),
            "prompt": "horse",
            "generator": generator,
            "guidance_scale": 1.0,
            "num_inference_steps": 2,
            "output_type": "np",
        }
        return inputs

    def test_wuerstchen_decoder(self):
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        output = pipe(**self.get_dummy_inputs(device))
        image = output.images

        image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.0000, 0.0000, 0.0089, 1.0000, 1.0000, 0.3927, 1.0000, 1.0000, 1.0000])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    @skip_mps
    def test_inference_batch_single_identical(self):
        self._test_inference_batch_single_identical(expected_max_diff=1e-5)

    @skip_mps
    def test_attention_slicing_forward_pass(self):
        test_max_difference = torch_device == "cpu"
        test_mean_pixel_difference = False

        self._test_attention_slicing_forward_pass(
            test_max_difference=test_max_difference,
            test_mean_pixel_difference=test_mean_pixel_difference,
        )

    @unittest.skip(reason="float16 not supported and requires CUDA")
    def test_float16_inference(self):
        super().test_float16_inference()

    @unittest.skip("Test not supported.")
    def test_encode_prompt_works_in_isolation(self):
        super().test_encode_prompt_works_in_isolation()
@@ -1,273 +0,0 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import DDPMWuerstchenScheduler, WuerstchenPriorPipeline
from diffusers.pipelines.wuerstchen import WuerstchenPrior
from diffusers.utils.import_utils import is_peft_available
from diffusers.utils.testing_utils import enable_full_determinism, require_peft_backend, skip_mps, torch_device


if is_peft_available():
    from peft import LoraConfig
    from peft.tuners.tuners_utils import BaseTunerLayer

from ..test_pipelines_common import PipelineTesterMixin


enable_full_determinism()


class WuerstchenPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = WuerstchenPriorPipeline
    params = ["prompt"]
    batch_params = ["prompt", "negative_prompt"]
    required_optional_params = [
        "num_images_per_prompt",
        "generator",
        "num_inference_steps",
        "latents",
        "negative_prompt",
        "guidance_scale",
        "output_type",
        "return_dict",
    ]
    test_xformers_attention = False
    callback_cfg_params = ["text_encoder_hidden_states"]

    @property
    def text_embedder_hidden_size(self):
        return 32

    @property
    def time_input_dim(self):
        return 32

    @property
    def block_out_channels_0(self):
        return self.time_input_dim

    @property
    def time_embed_dim(self):
        return self.time_input_dim * 4

    @property
    def dummy_tokenizer(self):
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
        return tokenizer

    @property
    def dummy_text_encoder(self):
        torch.manual_seed(0)
        config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=self.text_embedder_hidden_size,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        return CLIPTextModel(config).eval()

    @property
    def dummy_prior(self):
        torch.manual_seed(0)

        model_kwargs = {
            "c_in": 2,
            "c": 8,
            "depth": 2,
            "c_cond": 32,
            "c_r": 8,
            "nhead": 2,
        }

        model = WuerstchenPrior(**model_kwargs)
        return model.eval()

    def get_dummy_components(self):
        prior = self.dummy_prior
        text_encoder = self.dummy_text_encoder
        tokenizer = self.dummy_tokenizer

        scheduler = DDPMWuerstchenScheduler()

        components = {
            "prior": prior,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "scheduler": scheduler,
        }

        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "horse",
            "generator": generator,
            "guidance_scale": 4.0,
            "num_inference_steps": 2,
            "output_type": "np",
        }
        return inputs

    def test_wuerstchen_prior(self):
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        output = pipe(**self.get_dummy_inputs(device))
        image = output.image_embeddings

        image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0]

        image_slice = image[0, 0, 0, -10:]
        image_from_tuple_slice = image_from_tuple[0, 0, 0, -10:]
        assert image.shape == (1, 2, 24, 24)

        expected_slice = np.array(
            [
                -7172.837,
                -3438.855,
                -1093.312,
                388.8835,
                -7471.467,
                -7998.1206,
                -5328.259,
                218.00089,
                -2731.5745,
                -8056.734,
            ]
        )
        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 5e-2

    @skip_mps
    def test_inference_batch_single_identical(self):
        self._test_inference_batch_single_identical(
            expected_max_diff=3e-1,
        )

    @skip_mps
    def test_attention_slicing_forward_pass(self):
        test_max_difference = torch_device == "cpu"
        test_mean_pixel_difference = False

        self._test_attention_slicing_forward_pass(
            test_max_difference=test_max_difference,
            test_mean_pixel_difference=test_mean_pixel_difference,
        )

    @unittest.skip(reason="flaky for now")
    def test_float16_inference(self):
        super().test_float16_inference()

    # Overridden because latent_mean and latent_std need to be 0 for this check.
    def test_callback_inputs(self):
        components = self.get_dummy_components()
        components["latent_mean"] = 0
        components["latent_std"] = 0
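        # Note (assumption): zeroing latent_mean/latent_std keeps the final
        # de-normalization from shifting the zeroed latents, so the all-zero
        # output check at the end of the test stays meaningful.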
        pipe = self.pipeline_class(**components)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        self.assertTrue(
            hasattr(pipe, "_callback_tensor_inputs"),
            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
        )

        def callback_inputs_test(pipe, i, t, callback_kwargs):
            missing_callback_inputs = set()
            for v in pipe._callback_tensor_inputs:
                if v not in callback_kwargs:
                    missing_callback_inputs.add(v)
            self.assertTrue(
                len(missing_callback_inputs) == 0, f"Missing callback tensor inputs: {missing_callback_inputs}"
            )
            last_i = pipe.num_timesteps - 1
            if i == last_i:
                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
            return callback_kwargs

        inputs = self.get_dummy_inputs(torch_device)
        inputs["callback_on_step_end"] = callback_inputs_test
        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
        inputs["output_type"] = "latent"

        output = pipe(**inputs)[0]
        assert output.abs().sum() == 0

    def check_if_lora_correctly_set(self, model) -> bool:
        """
        Checks if the LoRA layers are correctly set with peft
        """
        for module in model.modules():
            if isinstance(module, BaseTunerLayer):
                return True
        return False

    def get_lora_components(self):
        prior = self.dummy_prior

        prior_lora_config = LoraConfig(
            r=4, lora_alpha=4, target_modules=["to_q", "to_k", "to_v", "to_out.0"], init_lora_weights=False
        )

        return prior, prior_lora_config

    @require_peft_backend
    def test_inference_with_prior_lora(self):
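        # Adding a LoRA adapter to the prior should wrap its attention modules
        # in peft tuner layers while leaving the output shape unchanged.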
        _, prior_lora_config = self.get_lora_components()
        device = "cpu"

        components = self.get_dummy_components()

        pipe = self.pipeline_class(**components)
        pipe = pipe.to(device)

        pipe.set_progress_bar_config(disable=None)

        output_no_lora = pipe(**self.get_dummy_inputs(device))
        image_embed = output_no_lora.image_embeddings
        self.assertTrue(image_embed.shape == (1, 2, 24, 24))

        pipe.prior.add_adapter(prior_lora_config)
        self.assertTrue(self.check_if_lora_correctly_set(pipe.prior), "Lora not correctly set in prior")

        output_lora = pipe(**self.get_dummy_inputs(device))
        lora_image_embed = output_lora.image_embeddings

        self.assertTrue(image_embed.shape == lora_image_embed.shape)

    @unittest.skip("Test not supported: the pipeline dtype cannot be inferred without the text encoder.")
    def test_encode_prompt_works_in_isolation(self):
        pass