mirror of
https://github.com/huggingface/diffusers.git
synced 2026-01-27 17:22:53 +03:00
Make height and width optional (#1401)
* fix * add test * fix test * uP * up * fix some tests
This commit is contained in:
committed by
GitHub
parent
81d8f4a9e1
commit
e0e86b7470
@@ -390,8 +390,8 @@ class AltDiffusionPipeline(DiffusionPipeline):
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -411,9 +411,9 @@ class AltDiffusionPipeline(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
@@ -459,6 +459,9 @@ class AltDiffusionPipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * 8
|
||||
width = width or self.unet.config.sample_size * 8
|
||||
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
self.check_inputs(prompt, height, width, callback_steps)
|
||||
|
||||
@@ -65,8 +65,8 @@ class LDMTextToImagePipeline(DiffusionPipeline):
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
height: Optional[int] = 256,
|
||||
width: Optional[int] = 256,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
guidance_scale: Optional[float] = 1.0,
|
||||
eta: Optional[float] = 0.0,
|
||||
@@ -79,9 +79,9 @@ class LDMTextToImagePipeline(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 256):
|
||||
height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 256):
|
||||
width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
@@ -106,6 +106,9 @@ class LDMTextToImagePipeline(DiffusionPipeline):
|
||||
`return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
|
||||
generated images.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * 8
|
||||
width = width or self.unet.config.sample_size * 8
|
||||
|
||||
if isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
|
||||
@@ -160,13 +160,17 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline):
|
||||
params: Union[Dict, FrozenDict],
|
||||
prng_seed: jax.random.PRNGKey,
|
||||
num_inference_steps: int = 50,
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
guidance_scale: float = 7.5,
|
||||
latents: Optional[jnp.array] = None,
|
||||
debug: bool = False,
|
||||
neg_prompt_ids: jnp.array = None,
|
||||
):
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * 8
|
||||
width = width or self.unet.config.sample_size * 8
|
||||
|
||||
if height % 8 != 0 or width % 8 != 0:
|
||||
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
|
||||
|
||||
@@ -249,8 +253,8 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline):
|
||||
params: Union[Dict, FrozenDict],
|
||||
prng_seed: jax.random.PRNGKey,
|
||||
num_inference_steps: int = 50,
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
guidance_scale: float = 7.5,
|
||||
latents: jnp.array = None,
|
||||
return_dict: bool = True,
|
||||
@@ -265,9 +269,9 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
@@ -302,6 +306,10 @@ class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline):
|
||||
element is a list of `bool`s denoting whether the corresponding generated image likely represents
|
||||
"not-safe-for-work" (nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * 8
|
||||
width = width or self.unet.config.sample_size * 8
|
||||
|
||||
if jit:
|
||||
images = _p_generate(
|
||||
self,
|
||||
|
||||
@@ -172,8 +172,8 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline):
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
height: Optional[int] = 512,
|
||||
width: Optional[int] = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
guidance_scale: Optional[float] = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -187,6 +187,10 @@ class OnnxStableDiffusionPipeline(DiffusionPipeline):
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * 8
|
||||
width = width or self.unet.config.sample_size * 8
|
||||
|
||||
if isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif isinstance(prompt, list):
|
||||
|
||||
@@ -236,8 +236,8 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
|
||||
prompt: Union[str, List[str]],
|
||||
image: PIL.Image.Image,
|
||||
mask_image: PIL.Image.Image,
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -265,9 +265,9 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
|
||||
repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
|
||||
to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
|
||||
instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
@@ -312,6 +312,10 @@ class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * 8
|
||||
width = width or self.unet.config.sample_size * 8
|
||||
|
||||
if isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif isinstance(prompt, list):
|
||||
|
||||
@@ -389,8 +389,8 @@ class StableDiffusionPipeline(DiffusionPipeline):
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -410,9 +410,9 @@ class StableDiffusionPipeline(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
@@ -458,6 +458,9 @@ class StableDiffusionPipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * 8
|
||||
width = width or self.unet.config.sample_size * 8
|
||||
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
self.check_inputs(prompt, height, width, callback_steps)
|
||||
|
||||
@@ -292,8 +292,8 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline):
|
||||
def __call__(
|
||||
self,
|
||||
image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
@@ -315,9 +315,9 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline):
|
||||
configuration of
|
||||
[this](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json)
|
||||
`CLIPFeatureExtractor`
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
@@ -360,6 +360,9 @@ class StableDiffusionImageVariationPipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * 8
|
||||
width = width or self.unet.config.sample_size * 8
|
||||
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
self.check_inputs(image, height, width, callback_steps)
|
||||
|
||||
@@ -509,8 +509,8 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
|
||||
prompt: Union[str, List[str]],
|
||||
image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
mask_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -538,9 +538,9 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
|
||||
repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted
|
||||
to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L)
|
||||
instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
@@ -586,6 +586,9 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * 8
|
||||
width = width or self.unet.config.sample_size * 8
|
||||
|
||||
# 1. Check inputs
|
||||
self.check_inputs(prompt, height, width, callback_steps)
|
||||
|
||||
@@ -495,8 +495,8 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -521,9 +521,9 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
height (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
width (`int`, *optional*, defaults to self.unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
@@ -589,6 +589,9 @@ class StableDiffusionPipelineSafe(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.unet.config.sample_size * 8
|
||||
width = width or self.unet.config.sample_size * 8
|
||||
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
self.check_inputs(prompt, height, width, callback_steps)
|
||||
|
||||
@@ -111,8 +111,8 @@ class VersatileDiffusionPipeline(DiffusionPipeline):
|
||||
def image_variation(
|
||||
self,
|
||||
image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -131,9 +131,9 @@ class VersatileDiffusionPipeline(DiffusionPipeline):
|
||||
Args:
|
||||
image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `torch.Tensor`):
|
||||
The image prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
@@ -193,7 +193,7 @@ class VersatileDiffusionPipeline(DiffusionPipeline):
|
||||
>>> pipe = pipe.to("cuda")
|
||||
|
||||
>>> generator = torch.Generator(device="cuda").manual_seed(0)
|
||||
>>> image = pipe(image, generator=generator).images[0]
|
||||
>>> image = pipe.image_variation(image, generator=generator).images[0]
|
||||
>>> image.save("./car_variation.png")
|
||||
```
|
||||
|
||||
@@ -227,8 +227,8 @@ class VersatileDiffusionPipeline(DiffusionPipeline):
|
||||
def text_to_image(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -247,9 +247,9 @@ class VersatileDiffusionPipeline(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
@@ -341,8 +341,8 @@ class VersatileDiffusionPipeline(DiffusionPipeline):
|
||||
prompt: Union[PIL.Image.Image, List[PIL.Image.Image]],
|
||||
image: Union[str, List[str]],
|
||||
text_to_image_strength: float = 0.5,
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
@@ -360,9 +360,9 @@ class VersatileDiffusionPipeline(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
|
||||
@@ -454,8 +454,8 @@ class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline):
|
||||
prompt: Union[PIL.Image.Image, List[PIL.Image.Image]],
|
||||
image: Union[str, List[str]],
|
||||
text_to_image_strength: float = 0.5,
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
num_images_per_prompt: Optional[int] = 1,
|
||||
@@ -474,9 +474,9 @@ class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
@@ -551,6 +551,9 @@ class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline):
|
||||
[`~pipelines.stable_diffusion.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple. When
|
||||
returning a tuple, the first element is a list with the generated images.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.image_unet.config.sample_size * 8
|
||||
width = width or self.image_unet.config.sample_size * 8
|
||||
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
self.check_inputs(prompt, image, height, width, callback_steps)
|
||||
|
||||
@@ -297,8 +297,8 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
|
||||
def __call__(
|
||||
self,
|
||||
image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor],
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -318,9 +318,9 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
|
||||
Args:
|
||||
image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `torch.Tensor`):
|
||||
The image prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
@@ -391,6 +391,9 @@ class VersatileDiffusionImageVariationPipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.image_unet.config.sample_size * 8
|
||||
width = width or self.image_unet.config.sample_size * 8
|
||||
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
self.check_inputs(image, height, width, callback_steps)
|
||||
|
||||
@@ -357,8 +357,8 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
num_inference_steps: int = 50,
|
||||
guidance_scale: float = 7.5,
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -378,9 +378,9 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
height (`int`, *optional*, defaults to 512):
|
||||
height (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
|
||||
The height in pixels of the generated image.
|
||||
width (`int`, *optional*, defaults to 512):
|
||||
width (`int`, *optional*, defaults to self.image_unet.config.sample_size * 8):
|
||||
The width in pixels of the generated image.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
@@ -443,6 +443,9 @@ class VersatileDiffusionTextToImagePipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
# 0. Default height and width to unet
|
||||
height = height or self.image_unet.config.sample_size * 8
|
||||
width = width or self.image_unet.config.sample_size * 8
|
||||
|
||||
# 1. Check inputs. Raise error if not correct
|
||||
self.check_inputs(prompt, height, width, callback_steps)
|
||||
|
||||
@@ -171,10 +171,8 @@ class AltDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
expected_slice = np.array(
|
||||
[0.49249017, 0.46064827, 0.4790093, 0.50883967, 0.4811985, 0.51540506, 0.5084924, 0.4860553, 0.47318557]
|
||||
)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.5748162, 0.60447145, 0.48821217, 0.50100636, 0.5431185, 0.45763683, 0.49657696, 0.48132733, 0.47573093])
|
||||
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
||||
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
|
||||
@@ -220,10 +218,8 @@ class AltDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
expected_slice = np.array(
|
||||
[0.4786532, 0.45791715, 0.47507674, 0.50763345, 0.48375353, 0.515062, 0.51244247, 0.48673993, 0.47105807]
|
||||
)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.51605093, 0.5707241, 0.47365507, 0.50578886, 0.5633877, 0.4642503, 0.5182081, 0.48763484, 0.49084237])
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
||||
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
|
||||
|
||||
@@ -259,7 +255,7 @@ class AltDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
generator = torch.Generator(device=torch_device).manual_seed(0)
|
||||
image = alt_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
|
||||
|
||||
@slow
|
||||
|
||||
@@ -207,9 +207,10 @@ class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
)[0]
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
print(", ".join(image_slice.flatten().tolist()))
|
||||
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.5112, 0.4692, 0.4715, 0.5206, 0.4894, 0.5114, 0.5096, 0.4932, 0.4755])
|
||||
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
||||
@@ -302,9 +303,10 @@ class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
)[0]
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
print(", ".join(image_slice.flatten().tolist()))
|
||||
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.4937, 0.4649, 0.4716, 0.5145, 0.4889, 0.513, 0.513, 0.4905, 0.4738])
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
||||
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
|
||||
@@ -368,9 +370,10 @@ class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
)[0]
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
print(", ".join(image_slice.flatten().tolist()))
|
||||
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
||||
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
|
||||
@@ -413,9 +416,10 @@ class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
)[0]
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
print(", ".join(image_slice.flatten().tolist()))
|
||||
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
||||
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
|
||||
@@ -458,9 +462,10 @@ class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
)[0]
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
print(", ".join(image_slice.flatten().tolist()))
|
||||
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
||||
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
|
||||
@@ -533,7 +538,7 @@ class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
image = output.images
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.4851, 0.4617, 0.4765, 0.5127, 0.4845, 0.5153, 0.5141, 0.4886, 0.4719])
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
||||
|
||||
@@ -563,13 +568,13 @@ class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
# test num_images_per_prompt=1 (default)
|
||||
images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images
|
||||
|
||||
assert images.shape == (1, 128, 128, 3)
|
||||
assert images.shape == (1, 64, 64, 3)
|
||||
|
||||
# test num_images_per_prompt=1 (default) for batch of prompts
|
||||
batch_size = 2
|
||||
images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images
|
||||
|
||||
assert images.shape == (batch_size, 128, 128, 3)
|
||||
assert images.shape == (batch_size, 64, 64, 3)
|
||||
|
||||
# test num_images_per_prompt for single prompt
|
||||
num_images_per_prompt = 2
|
||||
@@ -577,7 +582,7 @@ class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt
|
||||
).images
|
||||
|
||||
assert images.shape == (num_images_per_prompt, 128, 128, 3)
|
||||
assert images.shape == (num_images_per_prompt, 64, 64, 3)
|
||||
|
||||
# test num_images_per_prompt for batch of prompts
|
||||
batch_size = 2
|
||||
@@ -585,7 +590,7 @@ class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
[prompt] * batch_size, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt
|
||||
).images
|
||||
|
||||
assert images.shape == (batch_size * num_images_per_prompt, 128, 128, 3)
|
||||
assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3)
|
||||
|
||||
@unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
|
||||
def test_stable_diffusion_fp16(self):
|
||||
@@ -618,7 +623,7 @@ class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
generator = torch.Generator(device=torch_device).manual_seed(0)
|
||||
image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
|
||||
def test_stable_diffusion_long_prompt(self):
|
||||
unet = self.dummy_cond_unet
|
||||
@@ -671,6 +676,43 @@ class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
assert cap_logger.out.count("@") == 25
|
||||
assert cap_logger_3.out == ""
|
||||
|
||||
def test_stable_diffusion_height_width_opt(self):
|
||||
unet = self.dummy_cond_unet
|
||||
scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
|
||||
vae = self.dummy_vae
|
||||
bert = self.dummy_text_encoder
|
||||
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
|
||||
|
||||
# make sure here that pndm scheduler skips prk
|
||||
sd_pipe = StableDiffusionPipeline(
|
||||
unet=unet,
|
||||
scheduler=scheduler,
|
||||
vae=vae,
|
||||
text_encoder=bert,
|
||||
tokenizer=tokenizer,
|
||||
safety_checker=None,
|
||||
feature_extractor=self.dummy_extractor,
|
||||
)
|
||||
sd_pipe = sd_pipe.to(torch_device)
|
||||
sd_pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
prompt = "hey"
|
||||
|
||||
output = sd_pipe(prompt, number_of_steps=2, output_type="np")
|
||||
image_shape = output.images[0].shape[:2]
|
||||
assert image_shape == [32, 32]
|
||||
|
||||
output = sd_pipe(prompt, number_of_steps=2, height=64, width=64, output_type="np")
|
||||
image_shape = output.images[0].shape[:2]
|
||||
assert image_shape == [64, 64]
|
||||
|
||||
config = dict(sd_pipe.unet.config)
|
||||
config["sample_size"] = 96
|
||||
sd_pipe.unet = UNet2DConditionModel.from_config(config)
|
||||
output = sd_pipe(prompt, number_of_steps=2, output_type="np")
|
||||
image_shape = output.images[0].shape[:2]
|
||||
assert image_shape == [96, 96]
|
||||
|
||||
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
|
||||
@@ -157,7 +157,7 @@ class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unitte
|
||||
print(image_slice.flatten())
|
||||
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.4935, 0.4784, 0.4802, 0.5027, 0.4805, 0.5149, 0.5143, 0.4879, 0.4731])
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
|
||||
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-3
|
||||
@@ -196,7 +196,7 @@ class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unitte
|
||||
|
||||
image_slice = image[-1, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (2, 128, 128, 3)
|
||||
assert image.shape == (2, 64, 64, 3)
|
||||
expected_slice = np.array([0.4939, 0.4627, 0.4831, 0.5710, 0.5387, 0.4428, 0.5230, 0.5545, 0.4586])
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
|
||||
|
||||
@@ -228,7 +228,7 @@ class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unitte
|
||||
output_type="np",
|
||||
).images
|
||||
|
||||
assert images.shape == (1, 128, 128, 3)
|
||||
assert images.shape == (1, 64, 64, 3)
|
||||
|
||||
# test num_images_per_prompt=1 (default) for batch of images
|
||||
batch_size = 2
|
||||
@@ -238,7 +238,7 @@ class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unitte
|
||||
output_type="np",
|
||||
).images
|
||||
|
||||
assert images.shape == (batch_size, 128, 128, 3)
|
||||
assert images.shape == (batch_size, 64, 64, 3)
|
||||
|
||||
# test num_images_per_prompt for single prompt
|
||||
num_images_per_prompt = 2
|
||||
@@ -249,7 +249,7 @@ class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unitte
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
).images
|
||||
|
||||
assert images.shape == (num_images_per_prompt, 128, 128, 3)
|
||||
assert images.shape == (num_images_per_prompt, 64, 64, 3)
|
||||
|
||||
# test num_images_per_prompt for batch of prompts
|
||||
batch_size = 2
|
||||
@@ -260,7 +260,7 @@ class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unitte
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
).images
|
||||
|
||||
assert images.shape == (batch_size * num_images_per_prompt, 128, 128, 3)
|
||||
assert images.shape == (batch_size * num_images_per_prompt, 64, 64, 3)
|
||||
|
||||
@unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
|
||||
def test_stable_diffusion_img_variation_fp16(self):
|
||||
@@ -297,7 +297,7 @@ class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unitte
|
||||
output_type="np",
|
||||
).images
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
|
||||
|
||||
@slow
|
||||
|
||||
@@ -212,7 +212,7 @@ class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.Test
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.5075, 0.4485, 0.4558, 0.5369, 0.5369, 0.5236, 0.5127, 0.4983, 0.4776])
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
||||
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
|
||||
@@ -300,7 +300,7 @@ class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.Test
|
||||
mask_image=mask_image,
|
||||
).images
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
|
||||
|
||||
@slow
|
||||
|
||||
@@ -155,7 +155,7 @@ class SafeDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.5112, 0.4692, 0.4715, 0.5206, 0.4894, 0.5114, 0.5096, 0.4932, 0.4755])
|
||||
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
||||
@@ -201,7 +201,7 @@ class SafeDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
expected_slice = np.array([0.4937, 0.4649, 0.4716, 0.5145, 0.4889, 0.513, 0.513, 0.4905, 0.4738])
|
||||
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
|
||||
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
|
||||
@@ -258,7 +258,7 @@ class SafeDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
generator = torch.Generator(device=torch_device).manual_seed(0)
|
||||
image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images
|
||||
|
||||
assert image.shape == (1, 128, 128, 3)
|
||||
assert image.shape == (1, 64, 64, 3)
|
||||
|
||||
|
||||
@slow
|
||||
|
||||
@@ -436,7 +436,7 @@ class PipelineFastTests(unittest.TestCase):
|
||||
|
||||
assert image_inpaint.shape == (1, 32, 32, 3)
|
||||
assert image_img2img.shape == (1, 32, 32, 3)
|
||||
assert image_text2img.shape == (1, 128, 128, 3)
|
||||
assert image_text2img.shape == (1, 64, 64, 3)
|
||||
|
||||
def test_set_scheduler(self):
|
||||
unet = self.dummy_cond_unet
|
||||
|
||||
@@ -78,7 +78,7 @@ class FlaxPipelineTests(unittest.TestCase):
|
||||
|
||||
images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images
|
||||
|
||||
assert images.shape == (num_samples, 1, 128, 128, 3)
|
||||
assert images.shape == (num_samples, 1, 64, 64, 3)
|
||||
if jax.device_count() == 8:
|
||||
assert np.abs(np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 3.1111548) < 1e-3
|
||||
assert np.abs(np.abs(images, dtype=np.float32).sum() - 199746.95) < 5e-1
|
||||
|
||||
Reference in New Issue
Block a user