From e1ef122260d015255b0a7c075fd08ed114670671 Mon Sep 17 00:00:00 2001 From: Narek Maloyan Date: Mon, 20 Jun 2022 20:11:43 +0000 Subject: [PATCH 01/10] fix alphas_cumprod --- src/diffusers/schedulers/scheduling_ddpm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index 1831a88bec..bc95c0afa8 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -137,8 +137,8 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin): return pred_prev_sample def forward_step(self, original_sample, noise, t): - sqrt_alpha_prod = self.alpha_prod_t[t] ** 0.5 - sqrt_one_minus_alpha_prod = (1 - self.alpha_prod_t[t]) ** 0.5 + sqrt_alpha_prod = self.alphas_cumprod[t] ** 0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[t]) ** 0.5 noisy_sample = sqrt_alpha_prod * original_sample + sqrt_one_minus_alpha_prod * noise return noisy_sample From ee902ddf3a7c6e56897a425f36fae48efd5926f5 Mon Sep 17 00:00:00 2001 From: Pratik Pingale Date: Tue, 21 Jun 2022 12:53:26 +0530 Subject: [PATCH 02/10] Fix: TODO checklist checkbox --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index bdd504c630..80b3f5ca60 100644 --- a/README.md +++ b/README.md @@ -278,13 +278,13 @@ wavwrite("generated_audio.wav", sampling_rate, audio.squeeze().cpu().numpy()) ## TODO -- Create common API for models [ ] -- Add tests for models [ ] -- Adapt schedulers for training [ ] -- Write google colab for training [ ] -- Write docs / Think about how to structure docs [ ] -- Add tests to circle ci [ ] -- Add [Diffusion LM models](https://arxiv.org/pdf/2205.14217.pdf) [ ] -- Add more vision models [ ] -- Add more speech models [ ] -- Add RL model [ ] +- [ ] Create common API for models +- [ ] Add tests for models +- [ ] Adapt schedulers for training +- [ ] Write google colab for training +- [ ] Write docs / Think about how to structure docs +- [ ] Add tests to circle ci +- [ ] Add [Diffusion LM models](https://arxiv.org/pdf/2205.14217.pdf) +- [ ] Add more vision models +- [ ] Add more speech models +- [ ] Add RL model From 848c86ca0a2e321cda294c3cd3e8a3572281c2f3 Mon Sep 17 00:00:00 2001 From: anton-l Date: Wed, 22 Jun 2022 13:38:14 +0200 Subject: [PATCH 03/10] batched forward diffusion step --- examples/train_unconditional.py | 16 +++++++--------- src/diffusers/schedulers/scheduling_ddpm.py | 18 +++++++++++++----- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/examples/train_unconditional.py b/examples/train_unconditional.py index fbdc0aba29..79144e9c02 100644 --- a/examples/train_unconditional.py +++ b/examples/train_unconditional.py @@ -39,7 +39,7 @@ def main(args): resamp_with_conv=True, resolution=args.resolution, ) - noise_scheduler = DDPMScheduler(timesteps=1000) + noise_scheduler = DDPMScheduler(timesteps=1000, tensor_format="pt") optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) augmentations = Compose( @@ -93,15 +93,13 @@ def main(args): pbar.set_description(f"Epoch {epoch}") for step, batch in enumerate(train_dataloader): clean_images = batch["input"] - noisy_images = torch.empty_like(clean_images) - noise_samples = torch.empty_like(clean_images) + noise_samples = torch.randn(clean_images.shape).to(clean_images.device) bsz = clean_images.shape[0] - timesteps = torch.randint(0, noise_scheduler.timesteps, (bsz,), device=clean_images.device).long() - for idx in range(bsz): - noise = torch.randn(clean_images.shape[1:]).to(clean_images.device) - noise_samples[idx] = noise - noisy_images[idx] = noise_scheduler.forward_step(clean_images[idx], noise, timesteps[idx]) + + # add noise onto the clean images according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_images = noise_scheduler.training_step(clean_images, noise_samples, timesteps) if step % args.gradient_accumulation_steps != 0: with accelerator.no_sync(model): @@ -146,7 +144,7 @@ def main(args): # save image test_dir = os.path.join(args.output_dir, "test_samples") os.makedirs(test_dir, exist_ok=True) - image_pil.save(f"{test_dir}/{epoch}.png") + image_pil.save(f"{test_dir}/{epoch:04d}.png") # save the model if args.push_to_hub: diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index eb85796f27..206b1477f2 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -17,6 +17,7 @@ import math import numpy as np +import torch from ..configuration_utils import ConfigMixin from .scheduling_utils import SchedulerMixin @@ -142,11 +143,18 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin): return pred_prev_sample - def forward_step(self, original_sample, noise, t): - sqrt_alpha_prod = self.alphas_cumprod[t] ** 0.5 - sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[t]) ** 0.5 - noisy_sample = sqrt_alpha_prod * original_sample + sqrt_one_minus_alpha_prod * noise - return noisy_sample + def training_step(self, original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor): + if timesteps.dim() != 1: + raise ValueError("`timesteps` must be a 1D tensor") + + device = original_samples.device + batch_size = original_samples.shape[0] + timesteps = timesteps.reshape(batch_size, 1, 1, 1) + + sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 + sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5 + noisy_samples = sqrt_alpha_prod.to(device) * original_samples + sqrt_one_minus_alpha_prod.to(device) * noise + return noisy_samples def __len__(self): return self.config.timesteps From 33abc79515e472d3a705a43ef7405f009bba2f95 Mon Sep 17 00:00:00 2001 From: Anton Lozhkov Date: Wed, 22 Jun 2022 13:52:45 +0200 Subject: [PATCH 04/10] Update README.md --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index d3d1c1c67e..d806e852e9 100644 --- a/examples/README.md +++ b/examples/README.md @@ -10,7 +10,7 @@ python -m torch.distributed.launch \ train_unconditional.py \ --dataset="huggan/flowers-102-categories" \ --resolution=64 \ - --output_path="flowers-ddpm" \ + --output_dir="flowers-ddpm" \ --batch_size=16 \ --num_epochs=100 \ --gradient_accumulation_steps=1 \ @@ -34,7 +34,7 @@ python -m torch.distributed.launch \ train_unconditional.py \ --dataset="huggan/pokemon" \ --resolution=64 \ - --output_path="pokemon-ddpm" \ + --output_dir="pokemon-ddpm" \ --batch_size=16 \ --num_epochs=100 \ --gradient_accumulation_steps=1 \ From d0032c6095a858c2f91166821e344aa3d71ab38b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 22 Jun 2022 12:38:36 +0000 Subject: [PATCH 05/10] refactor naming --- README.md | 4 +- scripts/conversion_glide.py | 10 ++-- src/diffusers/__init__.py | 12 +++-- src/diffusers/models/__init__.py | 2 +- src/diffusers/models/unet_glide.py | 6 +-- src/diffusers/pipelines/__init__.py | 9 ++-- src/diffusers/pipelines/grad_tts_utils.py | 22 ++------- src/diffusers/pipelines/pipeline_glide.py | 8 ++-- src/diffusers/schedulers/scheduling_ddim.py | 4 +- src/diffusers/schedulers/scheduling_ddpm.py | 4 +- src/diffusers/schedulers/scheduling_pndm.py | 2 +- src/diffusers/utils/__init__.py | 38 +++++++++++++++ ...rmers_and_inflect_and_unidecode_objects.py | 10 ++++ .../utils/dummy_transformers_objects.py | 11 ++--- tests/test_modeling_utils.py | 46 ++++++++++++------- utils/check_dummies.py | 5 +- 16 files changed, 122 insertions(+), 71 deletions(-) create mode 100644 src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py diff --git a/README.md b/README.md index 80b3f5ca60..f6889baf92 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ The class provides functionality to compute previous image according to alpha, b **Diffusion Pipeline**: End-to-end pipeline that includes multiple diffusion models, possible text encoders, ... -*Examples*: GLIDE, Latent-Diffusion, Imagen, DALL-E 2 +*Examples*: Glide, Latent-Diffusion, Imagen, DALL-E 2

@@ -190,7 +190,7 @@ image_pil.save("test.png") [Diffuser](https://diffusion-planning.github.io/) for planning in reinforcement learning: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1TmBmlYeKUZSkUZoJqfBmaicVTKx6nN1R?usp=sharing) -### 2. `diffusers` as a collection of popular Diffusion systems (GLIDE, Dalle, ...) +### 2. `diffusers` as a collection of popular Diffusion systems (Glide, Dalle, ...) For more examples see [pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines). diff --git a/scripts/conversion_glide.py b/scripts/conversion_glide.py index 2d04580e76..6cf0133db6 100644 --- a/scripts/conversion_glide.py +++ b/scripts/conversion_glide.py @@ -1,8 +1,8 @@ import torch from torch import nn -from diffusers import ClassifierFreeGuidanceScheduler, DDIMScheduler, GLIDESuperResUNetModel, GLIDETextToImageUNetModel -from diffusers.pipelines.pipeline_glide import GLIDE, CLIPTextModel +from diffusers import ClassifierFreeGuidanceScheduler, DDIMScheduler, GlideSuperResUNetModel, GlideTextToImageUNetModel +from diffusers.pipelines.pipeline_glide import Glide, CLIPTextModel from transformers import CLIPTextConfig, GPT2Tokenizer @@ -55,7 +55,7 @@ for layer_idx in range(config.num_hidden_layers): ### Convert the Text-to-Image UNet -text2im_model = GLIDETextToImageUNetModel( +text2im_model = GlideTextToImageUNetModel( in_channels=3, model_channels=192, out_channels=6, @@ -80,7 +80,7 @@ text_scheduler = ClassifierFreeGuidanceScheduler(timesteps=1000, beta_schedule=" # wget https://openaipublic.blob.core.windows.net/diffusion/dec-2021/upsample.pt ups_state_dict = torch.load("upsample.pt", map_location="cpu") -superres_model = GLIDESuperResUNetModel( +superres_model = GlideSuperResUNetModel( in_channels=6, model_channels=192, out_channels=6, @@ -101,7 +101,7 @@ upscale_scheduler = DDIMScheduler( timesteps=1000, beta_schedule="linear", beta_start=0.0001, beta_end=0.02, tensor_format="pt" ) -glide = GLIDE( +glide = Glide( text_unet=text2im_model, text_noise_scheduler=text_scheduler, text_encoder=model, diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 881d48240e..efb89e8597 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -1,7 +1,7 @@ # flake8: noqa # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. -from .utils import is_transformers_available +from .utils import is_inflect_available, is_transformers_available, is_unidecode_available __version__ = "0.0.4" @@ -16,8 +16,14 @@ from .schedulers import DDIMScheduler, DDPMScheduler, GradTTSScheduler, PNDMSche if is_transformers_available(): - from .models.unet_glide import GLIDESuperResUNetModel, GLIDETextToImageUNetModel, GLIDEUNetModel + from .models.unet_glide import GlideSuperResUNetModel, GlideTextToImageUNetModel, GlideUNetModel from .models.unet_grad_tts import UNetGradTTSModel - from .pipelines import GLIDE, GradTTS, LatentDiffusion + from .pipelines import Glide, LatentDiffusion else: from .utils.dummy_transformers_objects import * + + +if is_transformers_available() and is_inflect_available() and is_unidecode_available(): + from .pipelines import GradTTS +else: + from .utils.dummy_transformers_and_inflect_and_unidecode_objects import * diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index 20fb9bfeba..3f0c78b3c6 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -17,7 +17,7 @@ # limitations under the License. from .unet import UNetModel -from .unet_glide import GLIDESuperResUNetModel, GLIDETextToImageUNetModel, GLIDEUNetModel +from .unet_glide import GlideSuperResUNetModel, GlideTextToImageUNetModel, GlideUNetModel from .unet_grad_tts import UNetGradTTSModel from .unet_ldm import UNetLDMModel from .unet_rl import TemporalUNet diff --git a/src/diffusers/models/unet_glide.py b/src/diffusers/models/unet_glide.py index abbd7dae12..648ff9c34a 100644 --- a/src/diffusers/models/unet_glide.py +++ b/src/diffusers/models/unet_glide.py @@ -388,7 +388,7 @@ class QKVAttention(nn.Module): return a.reshape(bs, -1, length) -class GLIDEUNetModel(ModelMixin, ConfigMixin): +class GlideUNetModel(ModelMixin, ConfigMixin): """ The full UNet model with attention and timestep embedding. @@ -641,7 +641,7 @@ class GLIDEUNetModel(ModelMixin, ConfigMixin): return self.out(h) -class GLIDETextToImageUNetModel(GLIDEUNetModel): +class GlideTextToImageUNetModel(GlideUNetModel): """ A UNetModel that performs super-resolution. @@ -734,7 +734,7 @@ class GLIDETextToImageUNetModel(GLIDEUNetModel): return self.out(h) -class GLIDESuperResUNetModel(GLIDEUNetModel): +class GlideSuperResUNetModel(GlideUNetModel): """ A UNetModel that performs super-resolution. diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index e6e753e8e2..7ba126b03b 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -1,4 +1,4 @@ -from ..utils import is_transformers_available +from ..utils import is_inflect_available, is_transformers_available, is_unidecode_available from .pipeline_bddm import BDDM from .pipeline_ddim import DDIM from .pipeline_ddpm import DDPM @@ -6,6 +6,9 @@ from .pipeline_pndm import PNDM if is_transformers_available(): - from .pipeline_glide import GLIDE - from .pipeline_grad_tts import GradTTS + from .pipeline_glide import Glide from .pipeline_latent_diffusion import LatentDiffusion + + +if is_transformers_available() and is_unidecode_available() and is_inflect_available(): + from .pipeline_grad_tts import GradTTS diff --git a/src/diffusers/pipelines/grad_tts_utils.py b/src/diffusers/pipelines/grad_tts_utils.py index d0d6b89ce8..15995b85c8 100644 --- a/src/diffusers/pipelines/grad_tts_utils.py +++ b/src/diffusers/pipelines/grad_tts_utils.py @@ -6,20 +6,9 @@ from shutil import copyfile import torch +import inflect from transformers import PreTrainedTokenizer - - -try: - from unidecode import unidecode -except: - print("unidecode is not installed") - pass - -try: - import inflect -except: - print("inflect is not installed") - pass +from unidecode import unidecode valid_symbols = [ @@ -234,12 +223,7 @@ def english_cleaners(text): return text -try: - _inflect = inflect.engine() -except: - print("inflect is not installed") - _inflect = None - +_inflect = inflect.engine() _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)") diff --git a/src/diffusers/pipelines/pipeline_glide.py b/src/diffusers/pipelines/pipeline_glide.py index 07603e153e..0046055349 100644 --- a/src/diffusers/pipelines/pipeline_glide.py +++ b/src/diffusers/pipelines/pipeline_glide.py @@ -30,7 +30,7 @@ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPo from transformers.modeling_utils import PreTrainedModel from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, replace_return_docstrings -from ..models import GLIDESuperResUNetModel, GLIDETextToImageUNetModel +from ..models import GlideSuperResUNetModel, GlideTextToImageUNetModel from ..pipeline_utils import DiffusionPipeline from ..schedulers import DDIMScheduler, DDPMScheduler from ..utils import logging @@ -711,14 +711,14 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape): return res + torch.zeros(broadcast_shape, device=timesteps.device) -class GLIDE(DiffusionPipeline): +class Glide(DiffusionPipeline): def __init__( self, - text_unet: GLIDETextToImageUNetModel, + text_unet: GlideTextToImageUNetModel, text_noise_scheduler: DDPMScheduler, text_encoder: CLIPTextModel, tokenizer: GPT2Tokenizer, - upscale_unet: GLIDESuperResUNetModel, + upscale_unet: GlideSuperResUNetModel, upscale_noise_scheduler: DDIMScheduler, ): super().__init__() diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 1038db2876..d11af4ec25 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -73,7 +73,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): if beta_schedule == "linear": self.betas = np.linspace(beta_start, beta_end, timesteps, dtype=np.float32) elif beta_schedule == "squaredcos_cap_v2": - # GLIDE cosine schedule + # Glide cosine schedule self.betas = betas_for_alpha_bar(timesteps) else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") @@ -132,7 +132,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): std_dev_t = eta * variance ** (0.5) if use_clipped_residual: - # the residual is always re-derived from the clipped x_0 in GLIDE + # the residual is always re-derived from the clipped x_0 in Glide residual = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index eb85796f27..d6f91c918f 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -76,7 +76,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin): elif beta_schedule == "linear": self.betas = np.linspace(beta_start, beta_end, timesteps, dtype=np.float32) elif beta_schedule == "squaredcos_cap_v2": - # GLIDE cosine schedule + # Glide cosine schedule self.betas = betas_for_alpha_bar(timesteps) else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") @@ -108,7 +108,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin): elif variance_type == "fixed_large": variance = self.betas[t] elif variance_type == "fixed_large_log": - # GLIDE max_log + # Glide max_log variance = self.log(self.betas[t]) return variance diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py index 10679f5c6b..e7479d5497 100644 --- a/src/diffusers/schedulers/scheduling_pndm.py +++ b/src/diffusers/schedulers/scheduling_pndm.py @@ -66,7 +66,7 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin): if beta_schedule == "linear": self.betas = np.linspace(beta_start, beta_end, timesteps, dtype=np.float32) elif beta_schedule == "squaredcos_cap_v2": - # GLIDE cosine schedule + # Glide cosine schedule self.betas = betas_for_alpha_bar(timesteps) else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index c5a6e223d7..470526a8b5 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -45,10 +45,34 @@ except importlib_metadata.PackageNotFoundError: _transformers_available = False +_inflect_available = importlib.util.find_spec("inflect") is not None +try: + _inflect_version = importlib_metadata.version("inflect") + logger.debug(f"Successfully imported inflect version {_inflect_version}") +except importlib_metadata.PackageNotFoundError: + _inflect_available = False + + +_unidecode_available = importlib.util.find_spec("unidecode") is not None +try: + _unidecode_version = importlib_metadata.version("unidecode") + logger.debug(f"Successfully imported unidecode version {_unidecode_version}") +except importlib_metadata.PackageNotFoundError: + _unidecode_available = False + + def is_transformers_available(): return _transformers_available +def is_inflect_available(): + return _inflect_available + + +def is_unidecode_available(): + return _unidecode_available + + class RepositoryNotFoundError(HTTPError): """ Raised when trying to access a hf.co URL with an invalid repository name, or with a private repo name the user does @@ -70,9 +94,23 @@ TRANSFORMERS_IMPORT_ERROR = """ """ +UNIDECODE_IMPORT_ERROR = """ +{0} requires the unidecode library but it was not found in your environment. You can install it with pip: +`pip install Unidecode` +""" + + +INFLECT_IMPORT_ERROR = """ +{0} requires the inflect library but it was not found in your environment. You can install it with pip: +`pip install inflect` +""" + + BACKENDS_MAPPING = OrderedDict( [ ("transformers", (is_transformers_available, TRANSFORMERS_IMPORT_ERROR)), + ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)), + ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)), ] ) diff --git a/src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py b/src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py new file mode 100644 index 0000000000..320a93134a --- /dev/null +++ b/src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py @@ -0,0 +1,10 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +# flake8: noqa +from ..utils import DummyObject, requires_backends + + +class GradTTS(metaclass=DummyObject): + _backends = ["transformers", "inflect", "unidecode"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["transformers", "inflect", "unidecode"]) diff --git a/src/diffusers/utils/dummy_transformers_objects.py b/src/diffusers/utils/dummy_transformers_objects.py index 6466df193e..1efb17297f 100644 --- a/src/diffusers/utils/dummy_transformers_objects.py +++ b/src/diffusers/utils/dummy_transformers_objects.py @@ -3,21 +3,21 @@ from ..utils import DummyObject, requires_backends -class GLIDESuperResUNetModel(metaclass=DummyObject): +class GlideSuperResUNetModel(metaclass=DummyObject): _backends = ["transformers"] def __init__(self, *args, **kwargs): requires_backends(self, ["transformers"]) -class GLIDETextToImageUNetModel(metaclass=DummyObject): +class GlideTextToImageUNetModel(metaclass=DummyObject): _backends = ["transformers"] def __init__(self, *args, **kwargs): requires_backends(self, ["transformers"]) -class GLIDEUNetModel(metaclass=DummyObject): +class GlideUNetModel(metaclass=DummyObject): _backends = ["transformers"] def __init__(self, *args, **kwargs): @@ -31,10 +31,7 @@ class UNetGradTTSModel(metaclass=DummyObject): requires_backends(self, ["transformers"]) -GLIDE = None - - -class GradTTS(metaclass=DummyObject): +class Glide(metaclass=DummyObject): _backends = ["transformers"] def __init__(self, *args, **kwargs): diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index a58759b297..372435de9d 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -21,17 +21,17 @@ import unittest import numpy as np import torch -import pytest from diffusers import ( BDDM, DDIM, DDPM, - GLIDE, + Glide, PNDM, DDIMScheduler, DDPMScheduler, - GLIDESuperResUNetModel, - GLIDETextToImageUNetModel, + GlideSuperResUNetModel, + GlideTextToImageUNetModel, + GradTTS, LatentDiffusion, PNDMScheduler, UNetGradTTSModel, @@ -247,13 +247,13 @@ class UnetModelTests(ModelTesterMixin, unittest.TestCase): output_slice = output[0, -1, -3:, -3:].flatten() # fmt: off - expected_output_slice = torch.tensor([ 0.2891, -0.1899, 0.2595, -0.6214, 0.0968, -0.2622, 0.4688, 0.1311, 0.0053]) + expected_output_slice = torch.tensor([0.2891, -0.1899, 0.2595, -0.6214, 0.0968, -0.2622, 0.4688, 0.1311, 0.0053]) # fmt: on self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) -class GLIDESuperResUNetTests(ModelTesterMixin, unittest.TestCase): - model_class = GLIDESuperResUNetModel +class GlideSuperResUNetTests(ModelTesterMixin, unittest.TestCase): + model_class = GlideSuperResUNetModel @property def dummy_input(self): @@ -309,7 +309,7 @@ class GLIDESuperResUNetTests(ModelTesterMixin, unittest.TestCase): self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_from_pretrained_hub(self): - model, loading_info = GLIDESuperResUNetModel.from_pretrained( + model, loading_info = GlideSuperResUNetModel.from_pretrained( "fusing/glide-super-res-dummy", output_loading_info=True ) self.assertIsNotNone(model) @@ -321,7 +321,7 @@ class GLIDESuperResUNetTests(ModelTesterMixin, unittest.TestCase): assert image is not None, "Make sure output is not None" def test_output_pretrained(self): - model = GLIDESuperResUNetModel.from_pretrained("fusing/glide-super-res-dummy") + model = GlideSuperResUNetModel.from_pretrained("fusing/glide-super-res-dummy") torch.manual_seed(0) if torch.cuda.is_available(): @@ -342,8 +342,8 @@ class GLIDESuperResUNetTests(ModelTesterMixin, unittest.TestCase): self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) -class GLIDETextToImageUNetModelTests(ModelTesterMixin, unittest.TestCase): - model_class = GLIDETextToImageUNetModel +class GlideTextToImageUNetModelTests(ModelTesterMixin, unittest.TestCase): + model_class = GlideTextToImageUNetModel @property def dummy_input(self): @@ -401,7 +401,7 @@ class GLIDETextToImageUNetModelTests(ModelTesterMixin, unittest.TestCase): self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") def test_from_pretrained_hub(self): - model, loading_info = GLIDETextToImageUNetModel.from_pretrained( + model, loading_info = GlideTextToImageUNetModel.from_pretrained( "fusing/unet-glide-text2im-dummy", output_loading_info=True ) self.assertIsNotNone(model) @@ -413,7 +413,7 @@ class GLIDETextToImageUNetModelTests(ModelTesterMixin, unittest.TestCase): assert image is not None, "Make sure output is not None" def test_output_pretrained(self): - model = GLIDETextToImageUNetModel.from_pretrained("fusing/unet-glide-text2im-dummy") + model = GlideTextToImageUNetModel.from_pretrained("fusing/unet-glide-text2im-dummy") torch.manual_seed(0) if torch.cuda.is_available(): @@ -431,7 +431,7 @@ class GLIDETextToImageUNetModelTests(ModelTesterMixin, unittest.TestCase): output, _ = torch.split(output, 3, dim=1) output_slice = output[0, -1, -3:, -3:].flatten() # fmt: off - expected_output_slice = torch.tensor([ 2.7766, -10.3558, -14.9149, -0.9376, -14.9175, -17.7679, -5.5565, -12.9521, -12.9845]) + expected_output_slice = torch.tensor([2.7766, -10.3558, -14.9149, -0.9376, -14.9175, -17.7679, -5.5565, -12.9521, -12.9845]) # fmt: on self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) @@ -571,7 +571,7 @@ class UNetGradTTSModelTests(ModelTesterMixin, unittest.TestCase): output_slice = output[0, -3:, -3:].flatten() # fmt: off - expected_output_slice = torch.tensor([-0.0690, -0.0531, 0.0633, -0.0660, -0.0541, 0.0650, -0.0656, -0.0555, 0.0617]) + expected_output_slice = torch.tensor([-0.0690, -0.0531, 0.0633, -0.0660, -0.0541, 0.0650, -0.0656, -0.0555, 0.0617]) # fmt: on self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) @@ -689,7 +689,7 @@ class PipelineTesterMixin(unittest.TestCase): @slow def test_glide_text2img(self): model_id = "fusing/glide-base" - glide = GLIDE.from_pretrained(model_id) + glide = Glide.from_pretrained(model_id) prompt = "a pencil sketch of a corgi" generator = torch.manual_seed(0) @@ -701,6 +701,20 @@ class PipelineTesterMixin(unittest.TestCase): expected_slice = torch.tensor([0.7119, 0.7073, 0.6460, 0.7780, 0.7423, 0.6926, 0.7378, 0.7189, 0.7784]) assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2 + @slow + def test_grad_tts(self): + model_id = "fusing/grad-tts-libri-tts" + grad_tts = GradTTS.from_pretrained(model_id) + + text = "Hello world, I missed you so much." + + # generate mel spectograms using text + mel_spec = grad_tts(text) + + assert mel_spec.shape == (1, 256, 256, 3) + expected_slice = torch.tensor([0.7119, 0.7073, 0.6460, 0.7780, 0.7423, 0.6926, 0.7378, 0.7189, 0.7784]) + assert (mel_spec.flatten() - expected_slice).abs().max() < 1e-2 + def test_module_from_pipeline(self): model = DiffWave(num_res_layers=4) noise_scheduler = DDPMScheduler(timesteps=12) diff --git a/utils/check_dummies.py b/utils/check_dummies.py index e132b3499c..f9a45284f3 100644 --- a/utils/check_dummies.py +++ b/utils/check_dummies.py @@ -23,10 +23,9 @@ import re PATH_TO_DIFFUSERS = "src/diffusers" # Matches is_xxx_available() -_re_backend = re.compile(r"if is\_([a-z_]*)_available\(\)") +_re_backend = re.compile(r"is\_([a-z_]*)_available\(\)") # Matches from xxx import bla _re_single_line_import = re.compile(r"\s+from\s+\S*\s+import\s+([^\(\s].*)\n") -_re_test_backend = re.compile(r"^\s+if\s+not\s+is\_[a-z]*\_available\(\)") DUMMY_CONSTANT = """ @@ -54,7 +53,7 @@ def find_backend(line): if len(backends) == 0: return None - return backends[0] + return "_and_".join(backends) def read_init(): From 7b43035bcb2ff788ce10fb64c81e61bd4bcf6433 Mon Sep 17 00:00:00 2001 From: anton-l Date: Wed, 22 Jun 2022 15:15:54 +0200 Subject: [PATCH 06/10] init text2im script --- examples/train_latent_text_to_image.py | 202 ++++++++++++++++++++++ src/diffusers/pipelines/pipeline_glide.py | 16 -- 2 files changed, 202 insertions(+), 16 deletions(-) create mode 100644 examples/train_latent_text_to_image.py diff --git a/examples/train_latent_text_to_image.py b/examples/train_latent_text_to_image.py new file mode 100644 index 0000000000..fd823fdad9 --- /dev/null +++ b/examples/train_latent_text_to_image.py @@ -0,0 +1,202 @@ +import argparse +import os + +import torch +import torch.nn.functional as F + +import PIL.Image +from accelerate import Accelerator +from datasets import load_dataset +from diffusers import DDPM, DDPMScheduler, UNetLDMModel +from diffusers.hub_utils import init_git_repo, push_to_hub +from diffusers.modeling_utils import unwrap_model +from diffusers.optimization import get_scheduler +from diffusers.utils import logging +from torchvision.transforms import ( + CenterCrop, + Compose, + InterpolationMode, + Lambda, + RandomHorizontalFlip, + Resize, + ToTensor, +) +from tqdm.auto import tqdm + + +logger = logging.get_logger(__name__) + + +def main(args): + accelerator = Accelerator(mixed_precision=args.mixed_precision) + + model = UNetLDMModel( + attention_resolutions=[4, 2, 1], + channel_mult=[1, 2, 4, 4], + context_dim=1280, + conv_resample=True, + dims=2, + dropout=0, + image_size=32, + in_channels=4, + model_channels=320, + num_heads=8, + num_res_blocks=2, + out_channels=4, + resblock_updown=False, + transformer_depth=1, + use_new_attention_order=False, + use_scale_shift_norm=False, + use_spatial_transformer=True, + legacy=False, + ) + noise_scheduler = DDPMScheduler(timesteps=1000, tensor_format="pt") + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) + + augmentations = Compose( + [ + Resize(args.resolution, interpolation=InterpolationMode.BILINEAR), + CenterCrop(args.resolution), + RandomHorizontalFlip(), + ToTensor(), + Lambda(lambda x: x * 2 - 1), + ] + ) + dataset = load_dataset(args.dataset, split="train") + + def transforms(examples): + images = [augmentations(image.convert("RGB")) for image in examples["image"]] + return {"input": images} + + dataset.set_transform(transforms) + train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True) + + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=args.warmup_steps, + num_training_steps=(len(train_dataloader) * args.num_epochs) // args.gradient_accumulation_steps, + ) + + model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, lr_scheduler + ) + + if args.push_to_hub: + repo = init_git_repo(args, at_init=True) + + # Train! + is_distributed = torch.distributed.is_available() and torch.distributed.is_initialized() + world_size = torch.distributed.get_world_size() if is_distributed else 1 + total_train_batch_size = args.batch_size * args.gradient_accumulation_steps * world_size + max_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_epochs + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataloader.dataset)}") + logger.info(f" Num Epochs = {args.num_epochs}") + logger.info(f" Instantaneous batch size per device = {args.batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {max_steps}") + + for epoch in range(args.num_epochs): + model.train() + with tqdm(total=len(train_dataloader), unit="ba") as pbar: + pbar.set_description(f"Epoch {epoch}") + for step, batch in enumerate(train_dataloader): + clean_images = batch["input"] + noise_samples = torch.randn(clean_images.shape).to(clean_images.device) + bsz = clean_images.shape[0] + timesteps = torch.randint(0, noise_scheduler.timesteps, (bsz,), device=clean_images.device).long() + + # add noise onto the clean images according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_images = noise_scheduler.training_step(clean_images, noise_samples, timesteps) + + if step % args.gradient_accumulation_steps != 0: + with accelerator.no_sync(model): + output = model(noisy_images, timesteps) + # predict the noise residual + loss = F.mse_loss(output, noise_samples) + loss = loss / args.gradient_accumulation_steps + accelerator.backward(loss) + else: + output = model(noisy_images, timesteps) + # predict the noise residual + loss = F.mse_loss(output, noise_samples) + loss = loss / args.gradient_accumulation_steps + accelerator.backward(loss) + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + pbar.update(1) + pbar.set_postfix(loss=loss.detach().item(), lr=optimizer.param_groups[0]["lr"]) + + optimizer.step() + if is_distributed: + torch.distributed.barrier() + + # Generate a sample image for visual inspection + if args.local_rank in [-1, 0]: + model.eval() + with torch.no_grad(): + pipeline = DDPM(unet=unwrap_model(model), noise_scheduler=noise_scheduler) + + generator = torch.manual_seed(0) + # run pipeline in inference (sample random noise and denoise) + image = pipeline(generator=generator) + + # process image to PIL + image_processed = image.cpu().permute(0, 2, 3, 1) + image_processed = (image_processed + 1.0) * 127.5 + image_processed = image_processed.type(torch.uint8).numpy() + image_pil = PIL.Image.fromarray(image_processed[0]) + + # save image + test_dir = os.path.join(args.output_dir, "test_samples") + os.makedirs(test_dir, exist_ok=True) + image_pil.save(f"{test_dir}/{epoch:04d}.png") + + # save the model + if args.push_to_hub: + push_to_hub(args, pipeline, repo, commit_message=f"Epoch {epoch}", blocking=False) + else: + pipeline.save_pretrained(args.output_dir) + if is_distributed: + torch.distributed.barrier() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument("--local_rank", type=int, default=-1) + parser.add_argument("--dataset", type=str, default="huggan/flowers-102-categories") + parser.add_argument("--output_dir", type=str, default="ddpm-model") + parser.add_argument("--overwrite_output_dir", action="store_true") + parser.add_argument("--resolution", type=int, default=64) + parser.add_argument("--batch_size", type=int, default=16) + parser.add_argument("--num_epochs", type=int, default=100) + parser.add_argument("--gradient_accumulation_steps", type=int, default=1) + parser.add_argument("--lr", type=float, default=1e-4) + parser.add_argument("--warmup_steps", type=int, default=500) + parser.add_argument("--push_to_hub", action="store_true") + parser.add_argument("--hub_token", type=str, default=None) + parser.add_argument("--hub_model_id", type=str, default=None) + parser.add_argument("--hub_private_repo", action="store_true") + parser.add_argument( + "--mixed_precision", + type=str, + default="no", + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose" + "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10." + "and an Nvidia Ampere GPU." + ), + ) + + args = parser.parse_args() + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + main(args) diff --git a/src/diffusers/pipelines/pipeline_glide.py b/src/diffusers/pipelines/pipeline_glide.py index 07603e153e..d3706b74e5 100644 --- a/src/diffusers/pipelines/pipeline_glide.py +++ b/src/diffusers/pipelines/pipeline_glide.py @@ -695,22 +695,6 @@ class CLIPTextModel(CLIPPreTrainedModel): ##################### -def _extract_into_tensor(arr, timesteps, broadcast_shape): - """ - Extract values from a 1-D numpy array for a batch of indices. - - :param arr: the 1-D numpy array. - :param timesteps: a tensor of indices into the array to extract. - :param broadcast_shape: a larger shape of K dimensions with the batch - dimension equal to the length of timesteps. - :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. - """ - res = torch.from_numpy(arr).to(device=timesteps.device)[timesteps].float() - while len(res.shape) < len(broadcast_shape): - res = res[..., None] - return res + torch.zeros(broadcast_shape, device=timesteps.device) - - class GLIDE(DiffusionPipeline): def __init__( self, From 48269070d23ad8a4c6f31bc6847c358aac182ad1 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 22 Jun 2022 13:40:08 +0000 Subject: [PATCH 07/10] more fixes --- README.md | 8 ++-- src/diffusers/__init__.py | 6 +-- src/diffusers/pipeline_utils.py | 26 ++-------- src/diffusers/pipelines/README.md | 2 +- src/diffusers/pipelines/__init__.py | 14 +++--- src/diffusers/pipelines/pipeline_bddm.py | 2 +- src/diffusers/pipelines/pipeline_ddim.py | 2 +- src/diffusers/pipelines/pipeline_ddpm.py | 2 +- src/diffusers/pipelines/pipeline_glide.py | 2 +- src/diffusers/pipelines/pipeline_grad_tts.py | 13 +++-- .../pipelines/pipeline_latent_diffusion.py | 2 +- src/diffusers/pipelines/pipeline_pndm.py | 2 +- tests/test_modeling_utils.py | 47 ++++++++++--------- 13 files changed, 59 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index f6889baf92..32dc7c8229 100644 --- a/README.md +++ b/README.md @@ -249,24 +249,24 @@ image_pil = PIL.Image.fromarray(image_processed[0]) image_pil.save("test.png") ``` -#### **Text to speech with GradTTS and BDDM** +#### **Text to speech with GradTTS and BDDMPipeline** ```python import torch -from diffusers import BDDM, GradTTS +from diffusers import BDDMPipeline, GradTTS torch_device = "cuda" # load grad tts and bddm pipelines grad_tts = GradTTS.from_pretrained("fusing/grad-tts-libri-tts") -bddm = BDDM.from_pretrained("fusing/diffwave-vocoder-ljspeech") +bddm = BDDMPipeline.from_pretrained("fusing/diffwave-vocoder-ljspeech") text = "Hello world, I missed you so much." # generate mel spectograms using text mel_spec = grad_tts(text, torch_device=torch_device) -# generate the speech by passing mel spectograms to BDDM pipeline +# generate the speech by passing mel spectograms to BDDMPipeline pipeline generator = torch.manual_seed(42) audio = bddm(mel_spec, generator, torch_device=torch_device) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index efb89e8597..aaca3d347b 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -11,19 +11,19 @@ from .models.unet import UNetModel from .models.unet_ldm import UNetLDMModel from .models.unet_rl import TemporalUNet from .pipeline_utils import DiffusionPipeline -from .pipelines import BDDM, DDIM, DDPM, PNDM +from .pipelines import BDDMPipeline, DDIMPipeline, DDPMPipeline, PNDMPipeline from .schedulers import DDIMScheduler, DDPMScheduler, GradTTSScheduler, PNDMScheduler, SchedulerMixin if is_transformers_available(): from .models.unet_glide import GlideSuperResUNetModel, GlideTextToImageUNetModel, GlideUNetModel from .models.unet_grad_tts import UNetGradTTSModel - from .pipelines import Glide, LatentDiffusion + from .pipelines import GlidePipeline, LatentDiffusionPipeline else: from .utils.dummy_transformers_objects import * if is_transformers_available() and is_inflect_available() and is_unidecode_available(): - from .pipelines import GradTTS + from .pipelines import GradTTSPipeline else: from .utils.dummy_transformers_and_inflect_and_unidecode_objects import * diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index d8a2644dc9..339ebb074a 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -21,7 +21,6 @@ from typing import Optional, Union from huggingface_hub import snapshot_download from .configuration_utils import ConfigMixin -from .dynamic_modules_utils import get_class_from_dynamic_module from .utils import DIFFUSERS_CACHE, logging @@ -81,9 +80,6 @@ class DiffusionPipeline(ConfigMixin): # set models setattr(self, name, module) - register_dict = {"_module": self.__module__.split(".")[-1]} - self.register_to_config(**register_dict) - def save_pretrained(self, save_directory: Union[str, os.PathLike]): self.save_config(save_directory) @@ -139,11 +135,7 @@ class DiffusionPipeline(ConfigMixin): config_dict = cls.get_config_dict(cached_folder) - # 2. Get class name and module candidates to load custom models - module_candidate_name = config_dict["_module"] - module_candidate = module_candidate_name + ".py" - - # 3. Load the pipeline class, if using custom module then load it from the hub + # 2. Load the pipeline class, if using custom module then load it from the hub # if we load from explicit class, let's use it if cls != DiffusionPipeline: pipeline_class = cls @@ -151,11 +143,6 @@ class DiffusionPipeline(ConfigMixin): diffusers_module = importlib.import_module(cls.__module__.split(".")[0]) pipeline_class = getattr(diffusers_module, config_dict["_class_name"]) - # (TODO - we should allow to load custom pipelines - # else we need to load the correct module from the Hub - # module = module_candidate - # pipeline_class = get_class_from_dynamic_module(cached_folder, module, class_name_, cached_folder) - init_dict, _ = pipeline_class.extract_init_dict(config_dict, **kwargs) init_kwargs = {} @@ -163,7 +150,7 @@ class DiffusionPipeline(ConfigMixin): # import it here to avoid circular import from diffusers import pipelines - # 4. Load each module in the pipeline + # 3. Load each module in the pipeline for name, (library_name, class_name) in init_dict.items(): is_pipeline_module = hasattr(pipelines, library_name) # if the model is in a pipeline module, then we load it from the pipeline @@ -171,14 +158,7 @@ class DiffusionPipeline(ConfigMixin): pipeline_module = getattr(pipelines, library_name) class_obj = getattr(pipeline_module, class_name) importable_classes = ALL_IMPORTABLE_CLASSES - class_candidates = {c: class_obj for c in ALL_IMPORTABLE_CLASSES.keys()} - elif library_name == module_candidate_name: - # if the model is not in diffusers or transformers, we need to load it from the hub - # assumes that it's a subclass of ModelMixin - class_obj = get_class_from_dynamic_module(cached_folder, module_candidate, class_name, cached_folder) - # since it's not from a library, we need to check class candidates for all importable classes - importable_classes = ALL_IMPORTABLE_CLASSES - class_candidates = {c: class_obj for c in ALL_IMPORTABLE_CLASSES.keys()} + class_candidates = {c: class_obj for c in importable_classes.keys()} else: # else we just import it from the library. library = importlib.import_module(library_name) diff --git a/src/diffusers/pipelines/README.md b/src/diffusers/pipelines/README.md index 61e653a80f..c0558d35b9 100644 --- a/src/diffusers/pipelines/README.md +++ b/src/diffusers/pipelines/README.md @@ -15,5 +15,5 @@ TODO(Patrick, Anton, Suraj) - PNDM for unconditional image generation in [pipeline_pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py). - Latent diffusion for text to image generation / conditional image generation in [pipeline_latent_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_latent_diffusion.py). - Glide for text to image generation / conditional image generation in [pipeline_glide](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_glide.py). -- BDDM for spectrogram-to-sound vocoding in [pipeline_bddm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_bddm.py). +- BDDMPipeline for spectrogram-to-sound vocoding in [pipeline_bddm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_bddm.py). - Grad-TTS for text to audio generation / conditional audio generation in [pipeline_grad_tts](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_grad_tts.py). diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 7ba126b03b..d26c5fc8a7 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -1,14 +1,14 @@ from ..utils import is_inflect_available, is_transformers_available, is_unidecode_available -from .pipeline_bddm import BDDM -from .pipeline_ddim import DDIM -from .pipeline_ddpm import DDPM -from .pipeline_pndm import PNDM +from .pipeline_bddm import BDDMPipeline +from .pipeline_ddim import DDIMPipeline +from .pipeline_ddpm import DDPMPipeline +from .pipeline_pndm import PNDMPipeline if is_transformers_available(): - from .pipeline_glide import Glide - from .pipeline_latent_diffusion import LatentDiffusion + from .pipeline_glide import GlidePipeline + from .pipeline_latent_diffusion import LatentDiffusionPipeline if is_transformers_available() and is_unidecode_available() and is_inflect_available(): - from .pipeline_grad_tts import GradTTS + from .pipeline_grad_tts import GradTTSPipeline diff --git a/src/diffusers/pipelines/pipeline_bddm.py b/src/diffusers/pipelines/pipeline_bddm.py index 3ca79c3dee..8b24cb9ceb 100644 --- a/src/diffusers/pipelines/pipeline_bddm.py +++ b/src/diffusers/pipelines/pipeline_bddm.py @@ -271,7 +271,7 @@ class DiffWave(ModelMixin, ConfigMixin): return self.final_conv(x) -class BDDM(DiffusionPipeline): +class BDDMPipeline(DiffusionPipeline): def __init__(self, diffwave, noise_scheduler): super().__init__() noise_scheduler = noise_scheduler.set_format("pt") diff --git a/src/diffusers/pipelines/pipeline_ddim.py b/src/diffusers/pipelines/pipeline_ddim.py index 272d3edb6b..8da24dbf8f 100644 --- a/src/diffusers/pipelines/pipeline_ddim.py +++ b/src/diffusers/pipelines/pipeline_ddim.py @@ -21,7 +21,7 @@ import tqdm from ..pipeline_utils import DiffusionPipeline -class DDIM(DiffusionPipeline): +class DDIMPipeline(DiffusionPipeline): def __init__(self, unet, noise_scheduler): super().__init__() noise_scheduler = noise_scheduler.set_format("pt") diff --git a/src/diffusers/pipelines/pipeline_ddpm.py b/src/diffusers/pipelines/pipeline_ddpm.py index ebcce77337..9cf83bfb75 100644 --- a/src/diffusers/pipelines/pipeline_ddpm.py +++ b/src/diffusers/pipelines/pipeline_ddpm.py @@ -21,7 +21,7 @@ import tqdm from ..pipeline_utils import DiffusionPipeline -class DDPM(DiffusionPipeline): +class DDPMPipeline(DiffusionPipeline): def __init__(self, unet, noise_scheduler): super().__init__() noise_scheduler = noise_scheduler.set_format("pt") diff --git a/src/diffusers/pipelines/pipeline_glide.py b/src/diffusers/pipelines/pipeline_glide.py index 0046055349..8680b7542a 100644 --- a/src/diffusers/pipelines/pipeline_glide.py +++ b/src/diffusers/pipelines/pipeline_glide.py @@ -711,7 +711,7 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape): return res + torch.zeros(broadcast_shape, device=timesteps.device) -class Glide(DiffusionPipeline): +class GlidePipeline(DiffusionPipeline): def __init__( self, text_unet: GlideTextToImageUNetModel, diff --git a/src/diffusers/pipelines/pipeline_grad_tts.py b/src/diffusers/pipelines/pipeline_grad_tts.py index 4201124923..51c861a262 100644 --- a/src/diffusers/pipelines/pipeline_grad_tts.py +++ b/src/diffusers/pipelines/pipeline_grad_tts.py @@ -420,7 +420,7 @@ class TextEncoder(ModelMixin, ConfigMixin): return mu, logw, x_mask -class GradTTS(DiffusionPipeline): +class GradTTSPipeline(DiffusionPipeline): def __init__(self, unet, text_encoder, noise_scheduler, tokenizer): super().__init__() noise_scheduler = noise_scheduler.set_format("pt") @@ -430,7 +430,14 @@ class GradTTS(DiffusionPipeline): @torch.no_grad() def __call__( - self, text, num_inference_steps=50, temperature=1.3, length_scale=0.91, speaker_id=15, torch_device=None + self, + text, + num_inference_steps=50, + temperature=1.3, + length_scale=0.91, + speaker_id=15, + torch_device=None, + generator=None, ): if torch_device is None: torch_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @@ -464,7 +471,7 @@ class GradTTS(DiffusionPipeline): mu_y = mu_y.transpose(1, 2) # Sample latent representation from terminal distribution N(mu_y, I) - z = mu_y + torch.randn_like(mu_y, device=mu_y.device) / temperature + z = mu_y + torch.randn(mu_y.shape, device=mu_y.device, generator=generator) / temperature xt = z * y_mask h = 1.0 / num_inference_steps diff --git a/src/diffusers/pipelines/pipeline_latent_diffusion.py b/src/diffusers/pipelines/pipeline_latent_diffusion.py index cd7f653bf4..7d386765d4 100644 --- a/src/diffusers/pipelines/pipeline_latent_diffusion.py +++ b/src/diffusers/pipelines/pipeline_latent_diffusion.py @@ -860,7 +860,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin): return dec, posterior -class LatentDiffusion(DiffusionPipeline): +class LatentDiffusionPipeline(DiffusionPipeline): def __init__(self, vqvae, bert, tokenizer, unet, noise_scheduler): super().__init__() noise_scheduler = noise_scheduler.set_format("pt") diff --git a/src/diffusers/pipelines/pipeline_pndm.py b/src/diffusers/pipelines/pipeline_pndm.py index a19f933ed1..5fd8a98483 100644 --- a/src/diffusers/pipelines/pipeline_pndm.py +++ b/src/diffusers/pipelines/pipeline_pndm.py @@ -21,7 +21,7 @@ import tqdm from ..pipeline_utils import DiffusionPipeline -class PNDM(DiffusionPipeline): +class PNDMPipeline(DiffusionPipeline): def __init__(self, unet, noise_scheduler): super().__init__() noise_scheduler = noise_scheduler.set_format("pt") diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index 372435de9d..720e68741f 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -22,17 +22,17 @@ import numpy as np import torch from diffusers import ( - BDDM, - DDIM, - DDPM, - Glide, - PNDM, + BDDMPipeline, + DDIMPipeline, DDIMScheduler, + DDPMPipeline, DDPMScheduler, + GlidePipeline, GlideSuperResUNetModel, GlideTextToImageUNetModel, - GradTTS, - LatentDiffusion, + GradTTSPipeline, + LatentDiffusionPipeline, + PNDMPipeline, PNDMScheduler, UNetGradTTSModel, UNetLDMModel, @@ -583,11 +583,11 @@ class PipelineTesterMixin(unittest.TestCase): model = UNetModel(ch=32, ch_mult=(1, 2), num_res_blocks=2, attn_resolutions=(16,), resolution=32) schedular = DDPMScheduler(timesteps=10) - ddpm = DDPM(model, schedular) + ddpm = DDPMPipeline(model, schedular) with tempfile.TemporaryDirectory() as tmpdirname: ddpm.save_pretrained(tmpdirname) - new_ddpm = DDPM.from_pretrained(tmpdirname) + new_ddpm = DDPMPipeline.from_pretrained(tmpdirname) generator = torch.manual_seed(0) @@ -601,7 +601,7 @@ class PipelineTesterMixin(unittest.TestCase): def test_from_pretrained_hub(self): model_path = "fusing/ddpm-cifar10" - ddpm = DDPM.from_pretrained(model_path) + ddpm = DDPMPipeline.from_pretrained(model_path) ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path) ddpm.noise_scheduler.num_timesteps = 10 @@ -624,7 +624,7 @@ class PipelineTesterMixin(unittest.TestCase): noise_scheduler = DDPMScheduler.from_config(model_id) noise_scheduler = noise_scheduler.set_format("pt") - ddpm = DDPM(unet=unet, noise_scheduler=noise_scheduler) + ddpm = DDPMPipeline(unet=unet, noise_scheduler=noise_scheduler) image = ddpm(generator=generator) image_slice = image[0, -1, -3:, -3:].cpu() @@ -641,7 +641,7 @@ class PipelineTesterMixin(unittest.TestCase): unet = UNetModel.from_pretrained(model_id) noise_scheduler = DDIMScheduler(tensor_format="pt") - ddim = DDIM(unet=unet, noise_scheduler=noise_scheduler) + ddim = DDIMPipeline(unet=unet, noise_scheduler=noise_scheduler) image = ddim(generator=generator, eta=0.0) image_slice = image[0, -1, -3:, -3:].cpu() @@ -660,7 +660,7 @@ class PipelineTesterMixin(unittest.TestCase): unet = UNetModel.from_pretrained(model_id) noise_scheduler = PNDMScheduler(tensor_format="pt") - pndm = PNDM(unet=unet, noise_scheduler=noise_scheduler) + pndm = PNDMPipeline(unet=unet, noise_scheduler=noise_scheduler) image = pndm(generator=generator) image_slice = image[0, -1, -3:, -3:].cpu() @@ -674,7 +674,7 @@ class PipelineTesterMixin(unittest.TestCase): @slow def test_ldm_text2img(self): model_id = "fusing/latent-diffusion-text2im-large" - ldm = LatentDiffusion.from_pretrained(model_id) + ldm = LatentDiffusionPipeline.from_pretrained(model_id) prompt = "A painting of a squirrel eating a burger" generator = torch.manual_seed(0) @@ -689,7 +689,7 @@ class PipelineTesterMixin(unittest.TestCase): @slow def test_glide_text2img(self): model_id = "fusing/glide-base" - glide = Glide.from_pretrained(model_id) + glide = GlidePipeline.from_pretrained(model_id) prompt = "a pencil sketch of a corgi" generator = torch.manual_seed(0) @@ -704,22 +704,25 @@ class PipelineTesterMixin(unittest.TestCase): @slow def test_grad_tts(self): model_id = "fusing/grad-tts-libri-tts" - grad_tts = GradTTS.from_pretrained(model_id) + grad_tts = GradTTSPipeline.from_pretrained(model_id) text = "Hello world, I missed you so much." + generator = torch.manual_seed(0) # generate mel spectograms using text - mel_spec = grad_tts(text) + mel_spec = grad_tts(text, generator=generator) - assert mel_spec.shape == (1, 256, 256, 3) - expected_slice = torch.tensor([0.7119, 0.7073, 0.6460, 0.7780, 0.7423, 0.6926, 0.7378, 0.7189, 0.7784]) - assert (mel_spec.flatten() - expected_slice).abs().max() < 1e-2 + assert mel_spec.shape == (1, 80, 143) + expected_slice = torch.tensor( + [-6.6119, -6.5963, -6.2776, -6.7496, -6.7096, -6.5131, -6.4643, -6.4817, -6.7185] + ) + assert (mel_spec[0, :3, :3].flatten() - expected_slice).abs().max() < 1e-2 def test_module_from_pipeline(self): model = DiffWave(num_res_layers=4) noise_scheduler = DDPMScheduler(timesteps=12) - bddm = BDDM(model, noise_scheduler) + bddm = BDDMPipeline(model, noise_scheduler) # check if the library name for the diffwave moduel is set to pipeline module self.assertTrue(bddm.config["diffwave"][0] == "pipeline_bddm") @@ -727,6 +730,6 @@ class PipelineTesterMixin(unittest.TestCase): # check if we can save and load the pipeline with tempfile.TemporaryDirectory() as tmpdirname: bddm.save_pretrained(tmpdirname) - _ = BDDM.from_pretrained(tmpdirname) + _ = BDDMPipeline.from_pretrained(tmpdirname) # check if the same works using the DifusionPipeline class _ = DiffusionPipeline.from_pretrained(tmpdirname) From 40e28e8bf4165c1167148fb825affd57c53b00ea Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 22 Jun 2022 13:42:09 +0000 Subject: [PATCH 08/10] only remove module if necessary --- src/diffusers/pipeline_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 339ebb074a..d73b8d8fb3 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -86,7 +86,7 @@ class DiffusionPipeline(ConfigMixin): model_index_dict = dict(self.config) model_index_dict.pop("_class_name") model_index_dict.pop("_diffusers_version") - model_index_dict.pop("_module") + model_index_dict.pop("_module", None) for pipeline_component_name in model_index_dict.keys(): sub_model = getattr(self, pipeline_component_name) From 3a17775454b80d2b0bceb0de7ac6b444ff288c75 Mon Sep 17 00:00:00 2001 From: Anton Lozhkov Date: Wed, 22 Jun 2022 17:26:07 +0200 Subject: [PATCH 09/10] TODO: Add FID and KID metrics --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 32dc7c8229..7cb20b0e0e 100644 --- a/README.md +++ b/README.md @@ -288,3 +288,4 @@ wavwrite("generated_audio.wav", sampling_rate, audio.squeeze().cpu().numpy()) - [ ] Add more vision models - [ ] Add more speech models - [ ] Add RL model +- [ ] Add FID and KID metrics From 6e456b7a7afa72543cad6503c91d31c6cb793a3a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 22 Jun 2022 18:38:32 +0200 Subject: [PATCH 10/10] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7cb20b0e0e..6c2c9799c2 100644 --- a/README.md +++ b/README.md @@ -253,12 +253,12 @@ image_pil.save("test.png") ```python import torch -from diffusers import BDDMPipeline, GradTTS +from diffusers import BDDMPipeline, GradTTSPipeline torch_device = "cuda" # load grad tts and bddm pipelines -grad_tts = GradTTS.from_pretrained("fusing/grad-tts-libri-tts") +grad_tts = GradTTSPipeline.from_pretrained("fusing/grad-tts-libri-tts") bddm = BDDMPipeline.from_pretrained("fusing/diffwave-vocoder-ljspeech") text = "Hello world, I missed you so much."