1
0
mirror of https://github.com/huggingface/diffusers.git synced 2026-01-29 07:22:12 +03:00

StableDiffusionInpaintingPipeline - resize image w.r.t height and width (#3322)

* StableDiffusionInpaintingPipeline now resizes input images and masks w.r.t to passed input height and width. Default is already set to 512. This addresses the common tensor mismatch error. Also moved type check into relevant funciton to keep main pipeline body tidy.

* Fixed StableDiffusionInpaintingPrepareMaskAndMaskedImageTests

Due to previous commit these tests were failing as height and width need to be passed into the prepare_mask_and_masked_image function, I have updated the code and added a height/width variable per unit test as it seemed more appropriate than the current hard coded solution

* Added a resolution test to StableDiffusionInpaintPipelineSlowTests

this unit test simply gets the input and resizes it into some that would fail (e.g. would throw a tensor mismatch error/not a mult of 8). Then passes it through the pipeline and verifies it produces output with correct dims w.r.t the passed height and width

---------

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
This commit is contained in:
Rupert Menneer
2023-05-10 19:14:25 +01:00
committed by GitHub
parent 94a0c644a8
commit edb087a217
2 changed files with 104 additions and 57 deletions

View File

@@ -36,7 +36,7 @@ from .safety_checker import StableDiffusionSafetyChecker
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
def prepare_mask_and_masked_image(image, mask):
def prepare_mask_and_masked_image(image, mask, height, width):
"""
Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be
converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the
@@ -64,6 +64,13 @@ def prepare_mask_and_masked_image(image, mask):
tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4
dimensions: ``batch x channels x height x width``.
"""
if image is None:
raise ValueError("`image` input cannot be undefined.")
if mask is None:
raise ValueError("`mask_image` input cannot be undefined.")
if isinstance(image, torch.Tensor):
if not isinstance(mask, torch.Tensor):
raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not")
@@ -111,8 +118,9 @@ def prepare_mask_and_masked_image(image, mask):
# preprocess image
if isinstance(image, (PIL.Image.Image, np.ndarray)):
image = [image]
if isinstance(image, list) and isinstance(image[0], PIL.Image.Image):
# resize all images w.r.t passed height an width
image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image]
image = [np.array(i.convert("RGB"))[None, :] for i in image]
image = np.concatenate(image, axis=0)
elif isinstance(image, list) and isinstance(image[0], np.ndarray):
@@ -126,6 +134,7 @@ def prepare_mask_and_masked_image(image, mask):
mask = [mask]
if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image):
mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask]
mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0)
mask = mask.astype(np.float32) / 255.0
elif isinstance(mask, list) and isinstance(mask[0], np.ndarray):
@@ -799,12 +808,6 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMi
negative_prompt_embeds,
)
if image is None:
raise ValueError("`image` input cannot be undefined.")
if mask_image is None:
raise ValueError("`mask_image` input cannot be undefined.")
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
batch_size = 1
@@ -830,8 +833,8 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMi
negative_prompt_embeds=negative_prompt_embeds,
)
# 4. Preprocess mask and image
mask, masked_image = prepare_mask_and_masked_image(image, mask_image)
# 4. Preprocess mask and image - resizes image and mask w.r.t height and width
mask, masked_image = prepare_mask_and_masked_image(image, mask_image, height, width)
# 5. set timesteps
self.scheduler.set_timesteps(num_inference_steps, device=device)

View File

@@ -303,6 +303,25 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):
assert np.abs(expected_slice - image_slice).max() < 1e-4
assert np.abs(expected_slice - image_slice).max() < 1e-3
def test_stable_diffusion_inpaint_pil_input_resolution_test(self):
pipe = StableDiffusionInpaintPipeline.from_pretrained(
"runwayml/stable-diffusion-inpainting", safety_checker=None
)
pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
pipe.enable_attention_slicing()
inputs = self.get_inputs(torch_device)
# change input image to a random size (one that would cause a tensor mismatch error)
inputs['image'] = inputs['image'].resize((127,127))
inputs['mask_image'] = inputs['mask_image'].resize((127,127))
inputs['height'] = 128
inputs['width'] = 128
image = pipe(**inputs).images
# verify that the returned image has the same height and width as the input height and width
assert image.shape == (1, inputs['height'], inputs['width'], 3)
@nightly
@require_torch_gpu
@@ -400,12 +419,13 @@ class StableDiffusionInpaintPipelineNightlyTests(unittest.TestCase):
class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase):
def test_pil_inputs(self):
im = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)
height, width = 32, 32
im = np.random.randint(0, 255, (height, width, 3), dtype=np.uint8)
im = Image.fromarray(im)
mask = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5
mask = np.random.randint(0, 255, (height, width), dtype=np.uint8) > 127.5
mask = Image.fromarray((mask * 255).astype(np.uint8))
t_mask, t_masked = prepare_mask_and_masked_image(im, mask)
t_mask, t_masked = prepare_mask_and_masked_image(im, mask, height, width)
self.assertTrue(isinstance(t_mask, torch.Tensor))
self.assertTrue(isinstance(t_masked, torch.Tensor))
@@ -413,8 +433,8 @@ class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase)
self.assertEqual(t_mask.ndim, 4)
self.assertEqual(t_masked.ndim, 4)
self.assertEqual(t_mask.shape, (1, 1, 32, 32))
self.assertEqual(t_masked.shape, (1, 3, 32, 32))
self.assertEqual(t_mask.shape, (1, 1, height, width))
self.assertEqual(t_masked.shape, (1, 3, height, width))
self.assertTrue(t_mask.dtype == torch.float32)
self.assertTrue(t_masked.dtype == torch.float32)
@@ -427,86 +447,100 @@ class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase)
self.assertTrue(t_mask.sum() > 0.0)
def test_np_inputs(self):
im_np = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)
height, width = 32, 32
im_np = np.random.randint(0, 255, (height, width, 3), dtype=np.uint8)
im_pil = Image.fromarray(im_np)
mask_np = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5
mask_np = np.random.randint(0, 255, (height, width,), dtype=np.uint8) > 127.5
mask_pil = Image.fromarray((mask_np * 255).astype(np.uint8))
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, mask_pil)
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, mask_pil, height, width)
self.assertTrue((t_mask_np == t_mask_pil).all())
self.assertTrue((t_masked_np == t_masked_pil).all())
def test_torch_3D_2D_inputs(self):
im_tensor = torch.randint(0, 255, (3, 32, 32), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (32, 32), dtype=torch.uint8) > 127.5
height, width = 32, 32
im_tensor = torch.randint(0, 255, (3, height, width,), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (height, width,), dtype=torch.uint8) > 127.5
im_np = im_tensor.numpy().transpose(1, 2, 0)
mask_np = mask_tensor.numpy()
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_torch_3D_3D_inputs(self):
im_tensor = torch.randint(0, 255, (3, 32, 32), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (1, 32, 32), dtype=torch.uint8) > 127.5
height, width = 32, 32
im_tensor = torch.randint(0, 255, (3, height, width,), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (1, height, width,), dtype=torch.uint8) > 127.5
im_np = im_tensor.numpy().transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0]
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_torch_4D_2D_inputs(self):
im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (32, 32), dtype=torch.uint8) > 127.5
height, width = 32, 32
im_tensor = torch.randint(0, 255, (1, 3, height, width,), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (height, width,), dtype=torch.uint8) > 127.5
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_torch_4D_3D_inputs(self):
im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (1, 32, 32), dtype=torch.uint8) > 127.5
height, width = 32, 32
im_tensor = torch.randint(0, 255, (1, 3, height, width,), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (1, height, width,), dtype=torch.uint8) > 127.5
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0]
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_torch_4D_4D_inputs(self):
im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (1, 1, 32, 32), dtype=torch.uint8) > 127.5
height, width = 32, 32
im_tensor = torch.randint(0, 255, (1, 3, height, width,), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (1, 1, height, width,), dtype=torch.uint8) > 127.5
im_np = im_tensor.numpy()[0].transpose(1, 2, 0)
mask_np = mask_tensor.numpy()[0][0]
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np, height, width)
self.assertTrue((t_mask_tensor == t_mask_np).all())
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_torch_batch_4D_3D(self):
im_tensor = torch.randint(0, 255, (2, 3, 32, 32), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (2, 32, 32), dtype=torch.uint8) > 127.5
height, width = 32, 32
im_tensor = torch.randint(0, 255, (2, 3, height, width,), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (2, height, width,), dtype=torch.uint8) > 127.5
im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor]
mask_nps = [mask.numpy() for mask in mask_tensor]
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)]
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
nps = [prepare_mask_and_masked_image(i, m, height, width) for i, m in zip(im_nps, mask_nps)]
t_mask_np = torch.cat([n[0] for n in nps])
t_masked_np = torch.cat([n[1] for n in nps])
@@ -514,14 +548,16 @@ class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase)
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_torch_batch_4D_4D(self):
im_tensor = torch.randint(0, 255, (2, 3, 32, 32), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (2, 1, 32, 32), dtype=torch.uint8) > 127.5
height, width = 32, 32
im_tensor = torch.randint(0, 255, (2, 3, height, width,), dtype=torch.uint8)
mask_tensor = torch.randint(0, 255, (2, 1, height, width,), dtype=torch.uint8) > 127.5
im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor]
mask_nps = [mask.numpy()[0] for mask in mask_tensor]
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)]
t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor, height, width)
nps = [prepare_mask_and_masked_image(i, m, height, width) for i, m in zip(im_nps, mask_nps)]
t_mask_np = torch.cat([n[0] for n in nps])
t_masked_np = torch.cat([n[1] for n in nps])
@@ -529,39 +565,47 @@ class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase)
self.assertTrue((t_masked_tensor == t_masked_np).all())
def test_shape_mismatch(self):
height, width = 32, 32
# test height and width
with self.assertRaises(AssertionError):
prepare_mask_and_masked_image(torch.randn(3, 32, 32), torch.randn(64, 64))
prepare_mask_and_masked_image(torch.randn(3, height, width,), torch.randn(64, 64), height, width)
# test batch dim
with self.assertRaises(AssertionError):
prepare_mask_and_masked_image(torch.randn(2, 3, 32, 32), torch.randn(4, 64, 64))
prepare_mask_and_masked_image(torch.randn(2, 3, height, width,), torch.randn(4, 64, 64), height, width)
# test batch dim
with self.assertRaises(AssertionError):
prepare_mask_and_masked_image(torch.randn(2, 3, 32, 32), torch.randn(4, 1, 64, 64))
prepare_mask_and_masked_image(torch.randn(2, 3, height, width,), torch.randn(4, 1, 64, 64), height, width)
def test_type_mismatch(self):
height, width = 32, 32
# test tensors-only
with self.assertRaises(TypeError):
prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.rand(3, 32, 32).numpy())
prepare_mask_and_masked_image(torch.rand(3, height, width,), torch.rand(3, height, width,).numpy(), height, width)
# test tensors-only
with self.assertRaises(TypeError):
prepare_mask_and_masked_image(torch.rand(3, 32, 32).numpy(), torch.rand(3, 32, 32))
prepare_mask_and_masked_image(torch.rand(3, height, width,).numpy(), torch.rand(3, height, width,), height, width)
def test_channels_first(self):
height, width = 32, 32
# test channels first for 3D tensors
with self.assertRaises(AssertionError):
prepare_mask_and_masked_image(torch.rand(32, 32, 3), torch.rand(3, 32, 32))
prepare_mask_and_masked_image(torch.rand(height, width, 3), torch.rand(3, height, width,), height, width)
def test_tensor_range(self):
height, width = 32, 32
# test im <= 1
with self.assertRaises(ValueError):
prepare_mask_and_masked_image(torch.ones(3, 32, 32) * 2, torch.rand(32, 32))
prepare_mask_and_masked_image(torch.ones(3, height, width,) * 2, torch.rand(height, width,), height, width)
# test im >= -1
with self.assertRaises(ValueError):
prepare_mask_and_masked_image(torch.ones(3, 32, 32) * (-2), torch.rand(32, 32))
prepare_mask_and_masked_image(torch.ones(3, height, width,) * (-2), torch.rand(height, width,), height, width)
# test mask <= 1
with self.assertRaises(ValueError):
prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.ones(32, 32) * 2)
prepare_mask_and_masked_image(torch.rand(3, height, width,), torch.ones(height, width,) * 2, height, width)
# test mask >= 0
with self.assertRaises(ValueError):
prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.ones(32, 32) * -1)
prepare_mask_and_masked_image(torch.rand(3, height, width,), torch.ones(height, width,) * -1, height, width)