From e1ef122260d015255b0a7c075fd08ed114670671 Mon Sep 17 00:00:00 2001
From: Narek Maloyan <narek1110@gmail.com>
Date: Mon, 20 Jun 2022 20:11:43 +0000
Subject: [PATCH 01/10] fix alphas_cumprod

---
 src/diffusers/schedulers/scheduling_ddpm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 1831a88bec..bc95c0afa8 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -137,8 +137,8 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
         return pred_prev_sample
 
     def forward_step(self, original_sample, noise, t):
-        sqrt_alpha_prod = self.alpha_prod_t[t] ** 0.5
-        sqrt_one_minus_alpha_prod = (1 - self.alpha_prod_t[t]) ** 0.5
+        sqrt_alpha_prod = self.alphas_cumprod[t] ** 0.5
+        sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[t]) ** 0.5
         noisy_sample = sqrt_alpha_prod * original_sample + sqrt_one_minus_alpha_prod * noise
         return noisy_sample
 

From ee902ddf3a7c6e56897a425f36fae48efd5926f5 Mon Sep 17 00:00:00 2001
From: Pratik Pingale <pratikbpingale9075@gmail.com>
Date: Tue, 21 Jun 2022 12:53:26 +0530
Subject: [PATCH 02/10] Fix: TODO checklist checkbox

---
 README.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index bdd504c630..80b3f5ca60 100644
--- a/README.md
+++ b/README.md
@@ -278,13 +278,13 @@ wavwrite("generated_audio.wav", sampling_rate, audio.squeeze().cpu().numpy())
 
 ## TODO
 
-- Create common API for models [ ]
-- Add tests for models [ ]
-- Adapt schedulers for training [ ]
-- Write google colab for training [ ]
-- Write docs / Think about how to structure docs [ ]
-- Add tests to circle ci [ ]
-- Add [Diffusion LM models](https://arxiv.org/pdf/2205.14217.pdf) [ ]
-- Add more vision models [ ]
-- Add more speech models [ ]
-- Add RL model [ ]
+- [ ] Create common API for models
+- [ ] Add tests for models
+- [ ] Adapt schedulers for training
+- [ ] Write google colab for training
+- [ ] Write docs / Think about how to structure docs
+- [ ] Add tests to circle ci
+- [ ] Add [Diffusion LM models](https://arxiv.org/pdf/2205.14217.pdf)
+- [ ] Add more vision models
+- [ ] Add more speech models
+- [ ] Add RL model

From 848c86ca0a2e321cda294c3cd3e8a3572281c2f3 Mon Sep 17 00:00:00 2001
From: anton-l <anton@huggingface.co>
Date: Wed, 22 Jun 2022 13:38:14 +0200
Subject: [PATCH 03/10] batched forward diffusion step

---
 examples/train_unconditional.py             | 16 +++++++---------
 src/diffusers/schedulers/scheduling_ddpm.py | 18 +++++++++++++-----
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/examples/train_unconditional.py b/examples/train_unconditional.py
index fbdc0aba29..79144e9c02 100644
--- a/examples/train_unconditional.py
+++ b/examples/train_unconditional.py
@@ -39,7 +39,7 @@ def main(args):
         resamp_with_conv=True,
         resolution=args.resolution,
     )
-    noise_scheduler = DDPMScheduler(timesteps=1000)
+    noise_scheduler = DDPMScheduler(timesteps=1000, tensor_format="pt")
     optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
 
     augmentations = Compose(
@@ -93,15 +93,13 @@ def main(args):
             pbar.set_description(f"Epoch {epoch}")
             for step, batch in enumerate(train_dataloader):
                 clean_images = batch["input"]
-                noisy_images = torch.empty_like(clean_images)
-                noise_samples = torch.empty_like(clean_images)
+                noise_samples = torch.randn(clean_images.shape).to(clean_images.device)
                 bsz = clean_images.shape[0]
-
                 timesteps = torch.randint(0, noise_scheduler.timesteps, (bsz,), device=clean_images.device).long()
-                for idx in range(bsz):
-                    noise = torch.randn(clean_images.shape[1:]).to(clean_images.device)
-                    noise_samples[idx] = noise
-                    noisy_images[idx] = noise_scheduler.forward_step(clean_images[idx], noise, timesteps[idx])
+
+                # add noise onto the clean images according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_images = noise_scheduler.training_step(clean_images, noise_samples, timesteps)
 
                 if step % args.gradient_accumulation_steps != 0:
                     with accelerator.no_sync(model):
@@ -146,7 +144,7 @@ def main(args):
             # save image
             test_dir = os.path.join(args.output_dir, "test_samples")
             os.makedirs(test_dir, exist_ok=True)
-            image_pil.save(f"{test_dir}/{epoch}.png")
+            image_pil.save(f"{test_dir}/{epoch:04d}.png")
 
             # save the model
             if args.push_to_hub:
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index eb85796f27..206b1477f2 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -17,6 +17,7 @@
 import math
 
 import numpy as np
+import torch
 
 from ..configuration_utils import ConfigMixin
 from .scheduling_utils import SchedulerMixin
@@ -142,11 +143,18 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
 
         return pred_prev_sample
 
-    def forward_step(self, original_sample, noise, t):
-        sqrt_alpha_prod = self.alphas_cumprod[t] ** 0.5
-        sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[t]) ** 0.5
-        noisy_sample = sqrt_alpha_prod * original_sample + sqrt_one_minus_alpha_prod * noise
-        return noisy_sample
+    def training_step(self, original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor):
+        if timesteps.dim() != 1:
+            raise ValueError("`timesteps` must be a 1D tensor")
+
+        device = original_samples.device
+        batch_size = original_samples.shape[0]
+        timesteps = timesteps.reshape(batch_size, 1, 1, 1)
+
+        sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5
+        sqrt_one_minus_alpha_prod = (1 - self.alphas_cumprod[timesteps]) ** 0.5
+        noisy_samples = sqrt_alpha_prod.to(device) * original_samples + sqrt_one_minus_alpha_prod.to(device) * noise
+        return noisy_samples
 
     def __len__(self):
         return self.config.timesteps

From 33abc79515e472d3a705a43ef7405f009bba2f95 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Wed, 22 Jun 2022 13:52:45 +0200
Subject: [PATCH 04/10] Update README.md

---
 examples/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index d3d1c1c67e..d806e852e9 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -10,7 +10,7 @@ python -m torch.distributed.launch \
   train_unconditional.py \
   --dataset="huggan/flowers-102-categories" \
   --resolution=64 \
-  --output_path="flowers-ddpm" \
+  --output_dir="flowers-ddpm" \
   --batch_size=16 \
   --num_epochs=100 \
   --gradient_accumulation_steps=1 \
@@ -34,7 +34,7 @@ python -m torch.distributed.launch \
   train_unconditional.py \
   --dataset="huggan/pokemon" \
   --resolution=64 \
-  --output_path="pokemon-ddpm" \
+  --output_dir="pokemon-ddpm" \
   --batch_size=16 \
   --num_epochs=100 \
   --gradient_accumulation_steps=1 \

From d0032c6095a858c2f91166821e344aa3d71ab38b Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 22 Jun 2022 12:38:36 +0000
Subject: [PATCH 05/10] refactor naming

---
 README.md                                     |  4 +-
 scripts/conversion_glide.py                   | 10 ++--
 src/diffusers/__init__.py                     | 12 +++--
 src/diffusers/models/__init__.py              |  2 +-
 src/diffusers/models/unet_glide.py            |  6 +--
 src/diffusers/pipelines/__init__.py           |  9 ++--
 src/diffusers/pipelines/grad_tts_utils.py     | 22 ++-------
 src/diffusers/pipelines/pipeline_glide.py     |  8 ++--
 src/diffusers/schedulers/scheduling_ddim.py   |  4 +-
 src/diffusers/schedulers/scheduling_ddpm.py   |  4 +-
 src/diffusers/schedulers/scheduling_pndm.py   |  2 +-
 src/diffusers/utils/__init__.py               | 38 +++++++++++++++
 ...rmers_and_inflect_and_unidecode_objects.py | 10 ++++
 .../utils/dummy_transformers_objects.py       | 11 ++---
 tests/test_modeling_utils.py                  | 46 ++++++++++++-------
 utils/check_dummies.py                        |  5 +-
 16 files changed, 122 insertions(+), 71 deletions(-)
 create mode 100644 src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py

diff --git a/README.md b/README.md
index 80b3f5ca60..f6889baf92 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ The class provides functionality to compute previous image according to alpha, b
     
 
 **Diffusion Pipeline**: End-to-end pipeline that includes multiple diffusion models, possible text encoders, ...
-*Examples*: GLIDE, Latent-Diffusion, Imagen, DALL-E 2
+*Examples*: Glide, Latent-Diffusion, Imagen, DALL-E 2
 
 <p align="center">
     <img src="https://user-images.githubusercontent.com/10695622/174348898-481bd7c2-5457-4830-89bc-f0907756f64c.jpeg" width="550"/>
@@ -190,7 +190,7 @@ image_pil.save("test.png")
 
 [Diffuser](https://diffusion-planning.github.io/) for planning in reinforcement learning: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1TmBmlYeKUZSkUZoJqfBmaicVTKx6nN1R?usp=sharing)
 
-### 2. `diffusers` as a collection of popular Diffusion systems (GLIDE, Dalle, ...)
+### 2. `diffusers` as a collection of popular Diffusion systems (Glide, Dalle, ...)
 
 For more examples see [pipelines](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines).
 
diff --git a/scripts/conversion_glide.py b/scripts/conversion_glide.py
index 2d04580e76..6cf0133db6 100644
--- a/scripts/conversion_glide.py
+++ b/scripts/conversion_glide.py
@@ -1,8 +1,8 @@
 import torch
 from torch import nn
 
-from diffusers import ClassifierFreeGuidanceScheduler, DDIMScheduler, GLIDESuperResUNetModel, GLIDETextToImageUNetModel
-from diffusers.pipelines.pipeline_glide import GLIDE, CLIPTextModel
+from diffusers import ClassifierFreeGuidanceScheduler, DDIMScheduler, GlideSuperResUNetModel, GlideTextToImageUNetModel
+from diffusers.pipelines.pipeline_glide import Glide, CLIPTextModel
 from transformers import CLIPTextConfig, GPT2Tokenizer
 
 
@@ -55,7 +55,7 @@ for layer_idx in range(config.num_hidden_layers):
 
 ### Convert the Text-to-Image UNet
 
-text2im_model = GLIDETextToImageUNetModel(
+text2im_model = GlideTextToImageUNetModel(
     in_channels=3,
     model_channels=192,
     out_channels=6,
@@ -80,7 +80,7 @@ text_scheduler = ClassifierFreeGuidanceScheduler(timesteps=1000, beta_schedule="
 # wget https://openaipublic.blob.core.windows.net/diffusion/dec-2021/upsample.pt
 ups_state_dict = torch.load("upsample.pt", map_location="cpu")
 
-superres_model = GLIDESuperResUNetModel(
+superres_model = GlideSuperResUNetModel(
     in_channels=6,
     model_channels=192,
     out_channels=6,
@@ -101,7 +101,7 @@ upscale_scheduler = DDIMScheduler(
     timesteps=1000, beta_schedule="linear", beta_start=0.0001, beta_end=0.02, tensor_format="pt"
 )
 
-glide = GLIDE(
+glide = Glide(
     text_unet=text2im_model,
     text_noise_scheduler=text_scheduler,
     text_encoder=model,
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 881d48240e..efb89e8597 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -1,7 +1,7 @@
 # flake8: noqa
 # There's no way to ignore "F401 '...' imported but unused" warnings in this
 # module, but to preserve other warnings. So, don't check this module at all.
-from .utils import is_transformers_available
+from .utils import is_inflect_available, is_transformers_available, is_unidecode_available
 
 
 __version__ = "0.0.4"
@@ -16,8 +16,14 @@ from .schedulers import DDIMScheduler, DDPMScheduler, GradTTSScheduler, PNDMSche
 
 
 if is_transformers_available():
-    from .models.unet_glide import GLIDESuperResUNetModel, GLIDETextToImageUNetModel, GLIDEUNetModel
+    from .models.unet_glide import GlideSuperResUNetModel, GlideTextToImageUNetModel, GlideUNetModel
     from .models.unet_grad_tts import UNetGradTTSModel
-    from .pipelines import GLIDE, GradTTS, LatentDiffusion
+    from .pipelines import Glide, LatentDiffusion
 else:
     from .utils.dummy_transformers_objects import *
+
+
+if is_transformers_available() and is_inflect_available() and is_unidecode_available():
+    from .pipelines import GradTTS
+else:
+    from .utils.dummy_transformers_and_inflect_and_unidecode_objects import *
diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py
index 20fb9bfeba..3f0c78b3c6 100644
--- a/src/diffusers/models/__init__.py
+++ b/src/diffusers/models/__init__.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 
 from .unet import UNetModel
-from .unet_glide import GLIDESuperResUNetModel, GLIDETextToImageUNetModel, GLIDEUNetModel
+from .unet_glide import GlideSuperResUNetModel, GlideTextToImageUNetModel, GlideUNetModel
 from .unet_grad_tts import UNetGradTTSModel
 from .unet_ldm import UNetLDMModel
 from .unet_rl import TemporalUNet
diff --git a/src/diffusers/models/unet_glide.py b/src/diffusers/models/unet_glide.py
index abbd7dae12..648ff9c34a 100644
--- a/src/diffusers/models/unet_glide.py
+++ b/src/diffusers/models/unet_glide.py
@@ -388,7 +388,7 @@ class QKVAttention(nn.Module):
         return a.reshape(bs, -1, length)
 
 
-class GLIDEUNetModel(ModelMixin, ConfigMixin):
+class GlideUNetModel(ModelMixin, ConfigMixin):
     """
     The full UNet model with attention and timestep embedding.
 
@@ -641,7 +641,7 @@ class GLIDEUNetModel(ModelMixin, ConfigMixin):
         return self.out(h)
 
 
-class GLIDETextToImageUNetModel(GLIDEUNetModel):
+class GlideTextToImageUNetModel(GlideUNetModel):
     """
     A UNetModel that performs super-resolution.
 
@@ -734,7 +734,7 @@ class GLIDETextToImageUNetModel(GLIDEUNetModel):
         return self.out(h)
 
 
-class GLIDESuperResUNetModel(GLIDEUNetModel):
+class GlideSuperResUNetModel(GlideUNetModel):
     """
     A UNetModel that performs super-resolution.
 
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index e6e753e8e2..7ba126b03b 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -1,4 +1,4 @@
-from ..utils import is_transformers_available
+from ..utils import is_inflect_available, is_transformers_available, is_unidecode_available
 from .pipeline_bddm import BDDM
 from .pipeline_ddim import DDIM
 from .pipeline_ddpm import DDPM
@@ -6,6 +6,9 @@ from .pipeline_pndm import PNDM
 
 
 if is_transformers_available():
-    from .pipeline_glide import GLIDE
-    from .pipeline_grad_tts import GradTTS
+    from .pipeline_glide import Glide
     from .pipeline_latent_diffusion import LatentDiffusion
+
+
+if is_transformers_available() and is_unidecode_available() and is_inflect_available():
+    from .pipeline_grad_tts import GradTTS
diff --git a/src/diffusers/pipelines/grad_tts_utils.py b/src/diffusers/pipelines/grad_tts_utils.py
index d0d6b89ce8..15995b85c8 100644
--- a/src/diffusers/pipelines/grad_tts_utils.py
+++ b/src/diffusers/pipelines/grad_tts_utils.py
@@ -6,20 +6,9 @@ from shutil import copyfile
 
 import torch
 
+import inflect
 from transformers import PreTrainedTokenizer
-
-
-try:
-    from unidecode import unidecode
-except:
-    print("unidecode is not installed")
-    pass
-
-try:
-    import inflect
-except:
-    print("inflect is not installed")
-    pass
+from unidecode import unidecode
 
 
 valid_symbols = [
@@ -234,12 +223,7 @@ def english_cleaners(text):
     return text
 
 
-try:
-    _inflect = inflect.engine()
-except:
-    print("inflect is not installed")
-    _inflect = None
-
+_inflect = inflect.engine()
 _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
 _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
 _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
diff --git a/src/diffusers/pipelines/pipeline_glide.py b/src/diffusers/pipelines/pipeline_glide.py
index 07603e153e..0046055349 100644
--- a/src/diffusers/pipelines/pipeline_glide.py
+++ b/src/diffusers/pipelines/pipeline_glide.py
@@ -30,7 +30,7 @@ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPo
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, replace_return_docstrings
 
-from ..models import GLIDESuperResUNetModel, GLIDETextToImageUNetModel
+from ..models import GlideSuperResUNetModel, GlideTextToImageUNetModel
 from ..pipeline_utils import DiffusionPipeline
 from ..schedulers import DDIMScheduler, DDPMScheduler
 from ..utils import logging
@@ -711,14 +711,14 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape):
     return res + torch.zeros(broadcast_shape, device=timesteps.device)
 
 
-class GLIDE(DiffusionPipeline):
+class Glide(DiffusionPipeline):
     def __init__(
         self,
-        text_unet: GLIDETextToImageUNetModel,
+        text_unet: GlideTextToImageUNetModel,
         text_noise_scheduler: DDPMScheduler,
         text_encoder: CLIPTextModel,
         tokenizer: GPT2Tokenizer,
-        upscale_unet: GLIDESuperResUNetModel,
+        upscale_unet: GlideSuperResUNetModel,
         upscale_noise_scheduler: DDIMScheduler,
     ):
         super().__init__()
diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 1038db2876..d11af4ec25 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -73,7 +73,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
         if beta_schedule == "linear":
             self.betas = np.linspace(beta_start, beta_end, timesteps, dtype=np.float32)
         elif beta_schedule == "squaredcos_cap_v2":
-            # GLIDE cosine schedule
+            # Glide cosine schedule
             self.betas = betas_for_alpha_bar(timesteps)
         else:
             raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
@@ -132,7 +132,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
         std_dev_t = eta * variance ** (0.5)
 
         if use_clipped_residual:
-            # the residual is always re-derived from the clipped x_0 in GLIDE
+            # the residual is always re-derived from the clipped x_0 in Glide
             residual = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
 
         # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index eb85796f27..d6f91c918f 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -76,7 +76,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
         elif beta_schedule == "linear":
             self.betas = np.linspace(beta_start, beta_end, timesteps, dtype=np.float32)
         elif beta_schedule == "squaredcos_cap_v2":
-            # GLIDE cosine schedule
+            # Glide cosine schedule
             self.betas = betas_for_alpha_bar(timesteps)
         else:
             raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
@@ -108,7 +108,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
         elif variance_type == "fixed_large":
             variance = self.betas[t]
         elif variance_type == "fixed_large_log":
-            # GLIDE max_log
+            # Glide max_log
             variance = self.log(self.betas[t])
 
         return variance
diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py
index 10679f5c6b..e7479d5497 100644
--- a/src/diffusers/schedulers/scheduling_pndm.py
+++ b/src/diffusers/schedulers/scheduling_pndm.py
@@ -66,7 +66,7 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin):
         if beta_schedule == "linear":
             self.betas = np.linspace(beta_start, beta_end, timesteps, dtype=np.float32)
         elif beta_schedule == "squaredcos_cap_v2":
-            # GLIDE cosine schedule
+            # Glide cosine schedule
             self.betas = betas_for_alpha_bar(timesteps)
         else:
             raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index c5a6e223d7..470526a8b5 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -45,10 +45,34 @@ except importlib_metadata.PackageNotFoundError:
     _transformers_available = False
 
 
+_inflect_available = importlib.util.find_spec("inflect") is not None
+try:
+    _inflect_version = importlib_metadata.version("inflect")
+    logger.debug(f"Successfully imported inflect version {_inflect_version}")
+except importlib_metadata.PackageNotFoundError:
+    _inflect_available = False
+
+
+_unidecode_available = importlib.util.find_spec("unidecode") is not None
+try:
+    _unidecode_version = importlib_metadata.version("unidecode")
+    logger.debug(f"Successfully imported unidecode version {_unidecode_version}")
+except importlib_metadata.PackageNotFoundError:
+    _unidecode_available = False
+
+
 def is_transformers_available():
     return _transformers_available
 
 
+def is_inflect_available():
+    return _inflect_available
+
+
+def is_unidecode_available():
+    return _unidecode_available
+
+
 class RepositoryNotFoundError(HTTPError):
     """
     Raised when trying to access a hf.co URL with an invalid repository name, or with a private repo name the user does
@@ -70,9 +94,23 @@ TRANSFORMERS_IMPORT_ERROR = """
 """
 
 
+UNIDECODE_IMPORT_ERROR = """
+{0} requires the unidecode library but it was not found in your environment. You can install it with pip:
+`pip install Unidecode`
+"""
+
+
+INFLECT_IMPORT_ERROR = """
+{0} requires the inflect library but it was not found in your environment. You can install it with pip:
+`pip install inflect`
+"""
+
+
 BACKENDS_MAPPING = OrderedDict(
     [
         ("transformers", (is_transformers_available, TRANSFORMERS_IMPORT_ERROR)),
+        ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)),
+        ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)),
     ]
 )
 
diff --git a/src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py b/src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py
new file mode 100644
index 0000000000..320a93134a
--- /dev/null
+++ b/src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py
@@ -0,0 +1,10 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+# flake8: noqa
+from ..utils import DummyObject, requires_backends
+
+
+class GradTTS(metaclass=DummyObject):
+    _backends = ["transformers", "inflect", "unidecode"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["transformers", "inflect", "unidecode"])
diff --git a/src/diffusers/utils/dummy_transformers_objects.py b/src/diffusers/utils/dummy_transformers_objects.py
index 6466df193e..1efb17297f 100644
--- a/src/diffusers/utils/dummy_transformers_objects.py
+++ b/src/diffusers/utils/dummy_transformers_objects.py
@@ -3,21 +3,21 @@
 from ..utils import DummyObject, requires_backends
 
 
-class GLIDESuperResUNetModel(metaclass=DummyObject):
+class GlideSuperResUNetModel(metaclass=DummyObject):
     _backends = ["transformers"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["transformers"])
 
 
-class GLIDETextToImageUNetModel(metaclass=DummyObject):
+class GlideTextToImageUNetModel(metaclass=DummyObject):
     _backends = ["transformers"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["transformers"])
 
 
-class GLIDEUNetModel(metaclass=DummyObject):
+class GlideUNetModel(metaclass=DummyObject):
     _backends = ["transformers"]
 
     def __init__(self, *args, **kwargs):
@@ -31,10 +31,7 @@ class UNetGradTTSModel(metaclass=DummyObject):
         requires_backends(self, ["transformers"])
 
 
-GLIDE = None
-
-
-class GradTTS(metaclass=DummyObject):
+class Glide(metaclass=DummyObject):
     _backends = ["transformers"]
 
     def __init__(self, *args, **kwargs):
diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py
index a58759b297..372435de9d 100755
--- a/tests/test_modeling_utils.py
+++ b/tests/test_modeling_utils.py
@@ -21,17 +21,17 @@ import unittest
 import numpy as np
 import torch
 
-import pytest
 from diffusers import (
     BDDM,
     DDIM,
     DDPM,
-    GLIDE,
+    Glide,
     PNDM,
     DDIMScheduler,
     DDPMScheduler,
-    GLIDESuperResUNetModel,
-    GLIDETextToImageUNetModel,
+    GlideSuperResUNetModel,
+    GlideTextToImageUNetModel,
+    GradTTS,
     LatentDiffusion,
     PNDMScheduler,
     UNetGradTTSModel,
@@ -247,13 +247,13 @@ class UnetModelTests(ModelTesterMixin, unittest.TestCase):
 
         output_slice = output[0, -1, -3:, -3:].flatten()
         # fmt: off
-        expected_output_slice = torch.tensor([ 0.2891, -0.1899,  0.2595, -0.6214,  0.0968, -0.2622,  0.4688,  0.1311, 0.0053])
+        expected_output_slice = torch.tensor([0.2891, -0.1899, 0.2595, -0.6214, 0.0968, -0.2622, 0.4688, 0.1311, 0.0053])
         # fmt: on
         self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
 
 
-class GLIDESuperResUNetTests(ModelTesterMixin, unittest.TestCase):
-    model_class = GLIDESuperResUNetModel
+class GlideSuperResUNetTests(ModelTesterMixin, unittest.TestCase):
+    model_class = GlideSuperResUNetModel
 
     @property
     def dummy_input(self):
@@ -309,7 +309,7 @@ class GLIDESuperResUNetTests(ModelTesterMixin, unittest.TestCase):
         self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
 
     def test_from_pretrained_hub(self):
-        model, loading_info = GLIDESuperResUNetModel.from_pretrained(
+        model, loading_info = GlideSuperResUNetModel.from_pretrained(
             "fusing/glide-super-res-dummy", output_loading_info=True
         )
         self.assertIsNotNone(model)
@@ -321,7 +321,7 @@ class GLIDESuperResUNetTests(ModelTesterMixin, unittest.TestCase):
         assert image is not None, "Make sure output is not None"
 
     def test_output_pretrained(self):
-        model = GLIDESuperResUNetModel.from_pretrained("fusing/glide-super-res-dummy")
+        model = GlideSuperResUNetModel.from_pretrained("fusing/glide-super-res-dummy")
 
         torch.manual_seed(0)
         if torch.cuda.is_available():
@@ -342,8 +342,8 @@ class GLIDESuperResUNetTests(ModelTesterMixin, unittest.TestCase):
         self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
 
 
-class GLIDETextToImageUNetModelTests(ModelTesterMixin, unittest.TestCase):
-    model_class = GLIDETextToImageUNetModel
+class GlideTextToImageUNetModelTests(ModelTesterMixin, unittest.TestCase):
+    model_class = GlideTextToImageUNetModel
 
     @property
     def dummy_input(self):
@@ -401,7 +401,7 @@ class GLIDETextToImageUNetModelTests(ModelTesterMixin, unittest.TestCase):
         self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
 
     def test_from_pretrained_hub(self):
-        model, loading_info = GLIDETextToImageUNetModel.from_pretrained(
+        model, loading_info = GlideTextToImageUNetModel.from_pretrained(
             "fusing/unet-glide-text2im-dummy", output_loading_info=True
         )
         self.assertIsNotNone(model)
@@ -413,7 +413,7 @@ class GLIDETextToImageUNetModelTests(ModelTesterMixin, unittest.TestCase):
         assert image is not None, "Make sure output is not None"
 
     def test_output_pretrained(self):
-        model = GLIDETextToImageUNetModel.from_pretrained("fusing/unet-glide-text2im-dummy")
+        model = GlideTextToImageUNetModel.from_pretrained("fusing/unet-glide-text2im-dummy")
 
         torch.manual_seed(0)
         if torch.cuda.is_available():
@@ -431,7 +431,7 @@ class GLIDETextToImageUNetModelTests(ModelTesterMixin, unittest.TestCase):
         output, _ = torch.split(output, 3, dim=1)
         output_slice = output[0, -1, -3:, -3:].flatten()
         # fmt: off
-        expected_output_slice = torch.tensor([  2.7766, -10.3558, -14.9149,  -0.9376, -14.9175, -17.7679,  -5.5565, -12.9521, -12.9845])
+        expected_output_slice = torch.tensor([2.7766, -10.3558, -14.9149, -0.9376, -14.9175, -17.7679, -5.5565, -12.9521, -12.9845])
         # fmt: on
         self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
 
@@ -571,7 +571,7 @@ class UNetGradTTSModelTests(ModelTesterMixin, unittest.TestCase):
 
         output_slice = output[0, -3:, -3:].flatten()
         # fmt: off
-        expected_output_slice = torch.tensor([-0.0690, -0.0531,  0.0633, -0.0660, -0.0541,  0.0650, -0.0656, -0.0555, 0.0617])
+        expected_output_slice = torch.tensor([-0.0690, -0.0531, 0.0633, -0.0660, -0.0541, 0.0650, -0.0656, -0.0555, 0.0617])
         # fmt: on
 
         self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
@@ -689,7 +689,7 @@ class PipelineTesterMixin(unittest.TestCase):
     @slow
     def test_glide_text2img(self):
         model_id = "fusing/glide-base"
-        glide = GLIDE.from_pretrained(model_id)
+        glide = Glide.from_pretrained(model_id)
 
         prompt = "a pencil sketch of a corgi"
         generator = torch.manual_seed(0)
@@ -701,6 +701,20 @@ class PipelineTesterMixin(unittest.TestCase):
         expected_slice = torch.tensor([0.7119, 0.7073, 0.6460, 0.7780, 0.7423, 0.6926, 0.7378, 0.7189, 0.7784])
         assert (image_slice.flatten() - expected_slice).abs().max() < 1e-2
 
+    @slow
+    def test_grad_tts(self):
+        model_id = "fusing/grad-tts-libri-tts"
+        grad_tts = GradTTS.from_pretrained(model_id)
+
+        text = "Hello world, I missed you so much."
+
+        # generate mel spectrograms using text
+        mel_spec = grad_tts(text)
+
+        assert mel_spec.shape == (1, 256, 256, 3)
+        expected_slice = torch.tensor([0.7119, 0.7073, 0.6460, 0.7780, 0.7423, 0.6926, 0.7378, 0.7189, 0.7784])
+        assert (mel_spec.flatten() - expected_slice).abs().max() < 1e-2
+
     def test_module_from_pipeline(self):
         model = DiffWave(num_res_layers=4)
         noise_scheduler = DDPMScheduler(timesteps=12)
diff --git a/utils/check_dummies.py b/utils/check_dummies.py
index e132b3499c..f9a45284f3 100644
--- a/utils/check_dummies.py
+++ b/utils/check_dummies.py
@@ -23,10 +23,9 @@ import re
 PATH_TO_DIFFUSERS = "src/diffusers"
 
 # Matches is_xxx_available()
-_re_backend = re.compile(r"if is\_([a-z_]*)_available\(\)")
+_re_backend = re.compile(r"is\_([a-z_]*)_available\(\)")
 # Matches from xxx import bla
 _re_single_line_import = re.compile(r"\s+from\s+\S*\s+import\s+([^\(\s].*)\n")
-_re_test_backend = re.compile(r"^\s+if\s+not\s+is\_[a-z]*\_available\(\)")
 
 
 DUMMY_CONSTANT = """
@@ -54,7 +53,7 @@ def find_backend(line):
     if len(backends) == 0:
         return None
 
-    return backends[0]
+    return "_and_".join(backends)
 
 
 def read_init():

From 7b43035bcb2ff788ce10fb64c81e61bd4bcf6433 Mon Sep 17 00:00:00 2001
From: anton-l <anton@huggingface.co>
Date: Wed, 22 Jun 2022 15:15:54 +0200
Subject: [PATCH 06/10] init text2im script

---
 examples/train_latent_text_to_image.py    | 202 ++++++++++++++++++++++
 src/diffusers/pipelines/pipeline_glide.py |  16 --
 2 files changed, 202 insertions(+), 16 deletions(-)
 create mode 100644 examples/train_latent_text_to_image.py

diff --git a/examples/train_latent_text_to_image.py b/examples/train_latent_text_to_image.py
new file mode 100644
index 0000000000..fd823fdad9
--- /dev/null
+++ b/examples/train_latent_text_to_image.py
@@ -0,0 +1,202 @@
+import argparse
+import os
+
+import torch
+import torch.nn.functional as F
+
+import PIL.Image
+from accelerate import Accelerator
+from datasets import load_dataset
+from diffusers import DDPM, DDPMScheduler, UNetLDMModel
+from diffusers.hub_utils import init_git_repo, push_to_hub
+from diffusers.modeling_utils import unwrap_model
+from diffusers.optimization import get_scheduler
+from diffusers.utils import logging
+from torchvision.transforms import (
+    CenterCrop,
+    Compose,
+    InterpolationMode,
+    Lambda,
+    RandomHorizontalFlip,
+    Resize,
+    ToTensor,
+)
+from tqdm.auto import tqdm
+
+
+logger = logging.get_logger(__name__)
+
+
+def main(args):
+    """Train a `UNetLDMModel` to predict the noise a `DDPMScheduler` adds to images.
+
+    `args` is the namespace produced by the argument parser at the bottom of this script.
+    After every epoch, the process with local rank -1 or 0 samples an image and saves or pushes the pipeline.
+    """
+    accelerator = Accelerator(mixed_precision=args.mixed_precision)
+
+    model = UNetLDMModel(
+        attention_resolutions=[4, 2, 1],
+        channel_mult=[1, 2, 4, 4],
+        context_dim=1280,
+        conv_resample=True,
+        dims=2,
+        dropout=0,
+        image_size=32,
+        in_channels=4,
+        model_channels=320,
+        num_heads=8,
+        num_res_blocks=2,
+        out_channels=4,
+        resblock_updown=False,
+        transformer_depth=1,
+        use_new_attention_order=False,
+        use_scale_shift_norm=False,
+        use_spatial_transformer=True,
+        legacy=False,
+    )
+    noise_scheduler = DDPMScheduler(timesteps=1000, tensor_format="pt")
+    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
+
+    augmentations = Compose(
+        [
+            Resize(args.resolution, interpolation=InterpolationMode.BILINEAR),
+            CenterCrop(args.resolution),
+            RandomHorizontalFlip(),
+            ToTensor(),
+            Lambda(lambda x: x * 2 - 1),
+        ]
+    )
+    dataset = load_dataset(args.dataset, split="train")
+
+    def transforms(examples):
+        images = [augmentations(image.convert("RGB")) for image in examples["image"]]
+        return {"input": images}
+
+    dataset.set_transform(transforms)
+    train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
+
+    lr_scheduler = get_scheduler(
+        "linear",
+        optimizer=optimizer,
+        num_warmup_steps=args.warmup_steps,
+        num_training_steps=(len(train_dataloader) * args.num_epochs) // args.gradient_accumulation_steps,
+    )
+
+    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        model, optimizer, train_dataloader, lr_scheduler
+    )
+
+    if args.push_to_hub:
+        repo = init_git_repo(args, at_init=True)
+
+    # Train!
+    is_distributed = torch.distributed.is_available() and torch.distributed.is_initialized()
+    world_size = torch.distributed.get_world_size() if is_distributed else 1
+    total_train_batch_size = args.batch_size * args.gradient_accumulation_steps * world_size
+    max_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_epochs
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataloader.dataset)}")
+    logger.info(f"  Num Epochs = {args.num_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {max_steps}")
+
+    for epoch in range(args.num_epochs):
+        model.train()
+        with tqdm(total=len(train_dataloader), unit="ba") as pbar:
+            pbar.set_description(f"Epoch {epoch}")
+            for step, batch in enumerate(train_dataloader):
+                clean_images = batch["input"]
+                noise_samples = torch.randn(clean_images.shape).to(clean_images.device)
+                bsz = clean_images.shape[0]
+                timesteps = torch.randint(0, noise_scheduler.timesteps, (bsz,), device=clean_images.device).long()
+
+                # forward diffusion: add noise to the clean images according to each sampled timestep
+                noisy_images = noise_scheduler.training_step(clean_images, noise_samples, timesteps)
+
+                # The model predicts the noise residual. Off-boundary micro-batches only accumulate
+                # local gradients (no sync); the optimizer steps exactly once per accumulation boundary.
+                if step % args.gradient_accumulation_steps != 0:
+                    with accelerator.no_sync(model):
+                        output = model(noisy_images, timesteps)
+                        loss = F.mse_loss(output, noise_samples) / args.gradient_accumulation_steps
+                        accelerator.backward(loss)
+                else:
+                    output = model(noisy_images, timesteps)
+                    loss = F.mse_loss(output, noise_samples) / args.gradient_accumulation_steps
+                    accelerator.backward(loss)
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+                    optimizer.step()
+                    lr_scheduler.step()
+                    optimizer.zero_grad()
+                pbar.update(1)
+                pbar.set_postfix(loss=loss.detach().item(), lr=optimizer.param_groups[0]["lr"])
+        if is_distributed:
+            torch.distributed.barrier()
+
+        # Generate a sample image for visual inspection
+        if args.local_rank in [-1, 0]:
+            model.eval()
+            with torch.no_grad():
+                pipeline = DDPM(unet=unwrap_model(model), noise_scheduler=noise_scheduler)
+
+                generator = torch.manual_seed(0)
+                # run pipeline in inference (sample random noise and denoise)
+                image = pipeline(generator=generator)
+
+            # process image to PIL
+            image_processed = image.cpu().permute(0, 2, 3, 1)
+            image_processed = (image_processed + 1.0) * 127.5
+            image_processed = image_processed.type(torch.uint8).numpy()
+            image_pil = PIL.Image.fromarray(image_processed[0])
+
+            # save image
+            test_dir = os.path.join(args.output_dir, "test_samples")
+            os.makedirs(test_dir, exist_ok=True)
+            image_pil.save(f"{test_dir}/{epoch:04d}.png")
+
+            # save the model
+            if args.push_to_hub:
+                push_to_hub(args, pipeline, repo, commit_message=f"Epoch {epoch}", blocking=False)
+            else:
+                pipeline.save_pretrained(args.output_dir)
+        if is_distributed:
+            torch.distributed.barrier()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument("--local_rank", type=int, default=-1)
+    parser.add_argument("--dataset", type=str, default="huggan/flowers-102-categories")
+    parser.add_argument("--output_dir", type=str, default="ddpm-model")
+    parser.add_argument("--overwrite_output_dir", action="store_true")
+    parser.add_argument("--resolution", type=int, default=64)
+    parser.add_argument("--batch_size", type=int, default=16)
+    parser.add_argument("--num_epochs", type=int, default=100)
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
+    parser.add_argument("--lr", type=float, default=1e-4)
+    parser.add_argument("--warmup_steps", type=int, default=500)
+    parser.add_argument("--push_to_hub", action="store_true")
+    parser.add_argument("--hub_token", type=str, default=None)
+    parser.add_argument("--hub_model_id", type=str, default=None)
+    parser.add_argument("--hub_private_repo", action="store_true")
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="no",
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). "
+            "Bf16 requires PyTorch >= 1.10 and an Nvidia Ampere GPU."
+        ),
+    )
+
+    args = parser.parse_args()
+    # A LOCAL_RANK environment variable, when set, takes precedence over --local_rank.
+    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+    if env_local_rank != -1 and env_local_rank != args.local_rank:
+        args.local_rank = env_local_rank
+
+    main(args)
diff --git a/src/diffusers/pipelines/pipeline_glide.py b/src/diffusers/pipelines/pipeline_glide.py
index 07603e153e..d3706b74e5 100644
--- a/src/diffusers/pipelines/pipeline_glide.py
+++ b/src/diffusers/pipelines/pipeline_glide.py
@@ -695,22 +695,6 @@ class CLIPTextModel(CLIPPreTrainedModel):
 #####################
 
 
-def _extract_into_tensor(arr, timesteps, broadcast_shape):
-    """
-    Extract values from a 1-D numpy array for a batch of indices.
-
-    :param arr: the 1-D numpy array.
-    :param timesteps: a tensor of indices into the array to extract.
-    :param broadcast_shape: a larger shape of K dimensions with the batch
-                            dimension equal to the length of timesteps.
-    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
-    """
-    res = torch.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
-    while len(res.shape) < len(broadcast_shape):
-        res = res[..., None]
-    return res + torch.zeros(broadcast_shape, device=timesteps.device)
-
-
 class GLIDE(DiffusionPipeline):
     def __init__(
         self,

From 48269070d23ad8a4c6f31bc6847c358aac182ad1 Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 22 Jun 2022 13:40:08 +0000
Subject: [PATCH 07/10] more fixes

---
 README.md                                     |  8 ++--
 src/diffusers/__init__.py                     |  6 +--
 src/diffusers/pipeline_utils.py               | 26 ++--------
 src/diffusers/pipelines/README.md             |  2 +-
 src/diffusers/pipelines/__init__.py           | 14 +++---
 src/diffusers/pipelines/pipeline_bddm.py      |  2 +-
 src/diffusers/pipelines/pipeline_ddim.py      |  2 +-
 src/diffusers/pipelines/pipeline_ddpm.py      |  2 +-
 src/diffusers/pipelines/pipeline_glide.py     |  2 +-
 src/diffusers/pipelines/pipeline_grad_tts.py  | 13 +++--
 .../pipelines/pipeline_latent_diffusion.py    |  2 +-
 src/diffusers/pipelines/pipeline_pndm.py      |  2 +-
 tests/test_modeling_utils.py                  | 47 ++++++++++---------
 13 files changed, 59 insertions(+), 69 deletions(-)

diff --git a/README.md b/README.md
index f6889baf92..32dc7c8229 100644
--- a/README.md
+++ b/README.md
@@ -249,24 +249,24 @@ image_pil = PIL.Image.fromarray(image_processed[0])
 image_pil.save("test.png")
 ```
 
-#### **Text to speech with GradTTS and BDDM**
+#### **Text to speech with GradTTS and BDDMPipeline**
 
 ```python
 import torch
-from diffusers import BDDM, GradTTS
+from diffusers import BDDMPipeline, GradTTS
 
 torch_device = "cuda"
 
 # load grad tts and bddm pipelines
 grad_tts = GradTTS.from_pretrained("fusing/grad-tts-libri-tts")
-bddm = BDDM.from_pretrained("fusing/diffwave-vocoder-ljspeech")
+bddm = BDDMPipeline.from_pretrained("fusing/diffwave-vocoder-ljspeech")
 
 text = "Hello world, I missed you so much."
 
 # generate mel spectograms using text
 mel_spec = grad_tts(text, torch_device=torch_device)
 
-#  generate the speech by passing mel spectograms to BDDM pipeline
+#  generate the speech by passing mel spectrograms to the BDDM pipeline
 generator = torch.manual_seed(42)
 audio = bddm(mel_spec, generator, torch_device=torch_device)
 
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index efb89e8597..aaca3d347b 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -11,19 +11,19 @@ from .models.unet import UNetModel
 from .models.unet_ldm import UNetLDMModel
 from .models.unet_rl import TemporalUNet
 from .pipeline_utils import DiffusionPipeline
-from .pipelines import BDDM, DDIM, DDPM, PNDM
+from .pipelines import BDDMPipeline, DDIMPipeline, DDPMPipeline, PNDMPipeline
 from .schedulers import DDIMScheduler, DDPMScheduler, GradTTSScheduler, PNDMScheduler, SchedulerMixin
 
 
 if is_transformers_available():
     from .models.unet_glide import GlideSuperResUNetModel, GlideTextToImageUNetModel, GlideUNetModel
     from .models.unet_grad_tts import UNetGradTTSModel
-    from .pipelines import Glide, LatentDiffusion
+    from .pipelines import GlidePipeline, LatentDiffusionPipeline
 else:
     from .utils.dummy_transformers_objects import *
 
 
 if is_transformers_available() and is_inflect_available() and is_unidecode_available():
-    from .pipelines import GradTTS
+    from .pipelines import GradTTSPipeline
 else:
     from .utils.dummy_transformers_and_inflect_and_unidecode_objects import *
diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index d8a2644dc9..339ebb074a 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -21,7 +21,6 @@ from typing import Optional, Union
 from huggingface_hub import snapshot_download
 
 from .configuration_utils import ConfigMixin
-from .dynamic_modules_utils import get_class_from_dynamic_module
 from .utils import DIFFUSERS_CACHE, logging
 
 
@@ -81,9 +80,6 @@ class DiffusionPipeline(ConfigMixin):
             # set models
             setattr(self, name, module)
 
-        register_dict = {"_module": self.__module__.split(".")[-1]}
-        self.register_to_config(**register_dict)
-
     def save_pretrained(self, save_directory: Union[str, os.PathLike]):
         self.save_config(save_directory)
 
@@ -139,11 +135,7 @@ class DiffusionPipeline(ConfigMixin):
 
         config_dict = cls.get_config_dict(cached_folder)
 
-        # 2. Get class name and module candidates to load custom models
-        module_candidate_name = config_dict["_module"]
-        module_candidate = module_candidate_name + ".py"
-
-        # 3. Load the pipeline class, if using custom module then load it from the hub
+        # 2. Load the pipeline class, if using custom module then load it from the hub
         # if we load from explicit class, let's use it
         if cls != DiffusionPipeline:
             pipeline_class = cls
@@ -151,11 +143,6 @@ class DiffusionPipeline(ConfigMixin):
             diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
             pipeline_class = getattr(diffusers_module, config_dict["_class_name"])
 
-            # (TODO - we should allow to load custom pipelines
-            # else we need to load the correct module from the Hub
-            # module = module_candidate
-            # pipeline_class = get_class_from_dynamic_module(cached_folder, module, class_name_, cached_folder)
-
         init_dict, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
 
         init_kwargs = {}
@@ -163,7 +150,7 @@ class DiffusionPipeline(ConfigMixin):
         # import it here to avoid circular import
         from diffusers import pipelines
 
-        # 4. Load each module in the pipeline
+        # 3. Load each module in the pipeline
         for name, (library_name, class_name) in init_dict.items():
             is_pipeline_module = hasattr(pipelines, library_name)
             # if the model is in a pipeline module, then we load it from the pipeline
@@ -171,14 +158,7 @@ class DiffusionPipeline(ConfigMixin):
                 pipeline_module = getattr(pipelines, library_name)
                 class_obj = getattr(pipeline_module, class_name)
                 importable_classes = ALL_IMPORTABLE_CLASSES
-                class_candidates = {c: class_obj for c in ALL_IMPORTABLE_CLASSES.keys()}
-            elif library_name == module_candidate_name:
-                # if the model is not in diffusers or transformers, we need to load it from the hub
-                # assumes that it's a subclass of ModelMixin
-                class_obj = get_class_from_dynamic_module(cached_folder, module_candidate, class_name, cached_folder)
-                # since it's not from a library, we need to check class candidates for all importable classes
-                importable_classes = ALL_IMPORTABLE_CLASSES
-                class_candidates = {c: class_obj for c in ALL_IMPORTABLE_CLASSES.keys()}
+                class_candidates = {c: class_obj for c in importable_classes.keys()}
             else:
                 # else we just import it from the library.
                 library = importlib.import_module(library_name)
diff --git a/src/diffusers/pipelines/README.md b/src/diffusers/pipelines/README.md
index 61e653a80f..c0558d35b9 100644
--- a/src/diffusers/pipelines/README.md
+++ b/src/diffusers/pipelines/README.md
@@ -15,5 +15,5 @@ TODO(Patrick, Anton, Suraj)
 - PNDM for unconditional image generation in [pipeline_pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py).
 - Latent diffusion for text to image generation / conditional image generation in [pipeline_latent_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_latent_diffusion.py).
 - Glide for text to image generation / conditional image generation in [pipeline_glide](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_glide.py).
-- BDDM for spectrogram-to-sound vocoding in [pipeline_bddm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_bddm.py).
+- BDDMPipeline for spectrogram-to-sound vocoding in [pipeline_bddm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_bddm.py).
 - Grad-TTS for text to audio generation / conditional audio generation in [pipeline_grad_tts](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_grad_tts.py).
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 7ba126b03b..d26c5fc8a7 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -1,14 +1,14 @@
 from ..utils import is_inflect_available, is_transformers_available, is_unidecode_available
-from .pipeline_bddm import BDDM
-from .pipeline_ddim import DDIM
-from .pipeline_ddpm import DDPM
-from .pipeline_pndm import PNDM
+from .pipeline_bddm import BDDMPipeline
+from .pipeline_ddim import DDIMPipeline
+from .pipeline_ddpm import DDPMPipeline
+from .pipeline_pndm import PNDMPipeline
 
 
 if is_transformers_available():
-    from .pipeline_glide import Glide
-    from .pipeline_latent_diffusion import LatentDiffusion
+    from .pipeline_glide import GlidePipeline
+    from .pipeline_latent_diffusion import LatentDiffusionPipeline
 
 
 if is_transformers_available() and is_unidecode_available() and is_inflect_available():
-    from .pipeline_grad_tts import GradTTS
+    from .pipeline_grad_tts import GradTTSPipeline
diff --git a/src/diffusers/pipelines/pipeline_bddm.py b/src/diffusers/pipelines/pipeline_bddm.py
index 3ca79c3dee..8b24cb9ceb 100644
--- a/src/diffusers/pipelines/pipeline_bddm.py
+++ b/src/diffusers/pipelines/pipeline_bddm.py
@@ -271,7 +271,7 @@ class DiffWave(ModelMixin, ConfigMixin):
         return self.final_conv(x)
 
 
-class BDDM(DiffusionPipeline):
+class BDDMPipeline(DiffusionPipeline):
     def __init__(self, diffwave, noise_scheduler):
         super().__init__()
         noise_scheduler = noise_scheduler.set_format("pt")
diff --git a/src/diffusers/pipelines/pipeline_ddim.py b/src/diffusers/pipelines/pipeline_ddim.py
index 272d3edb6b..8da24dbf8f 100644
--- a/src/diffusers/pipelines/pipeline_ddim.py
+++ b/src/diffusers/pipelines/pipeline_ddim.py
@@ -21,7 +21,7 @@ import tqdm
 from ..pipeline_utils import DiffusionPipeline
 
 
-class DDIM(DiffusionPipeline):
+class DDIMPipeline(DiffusionPipeline):
     def __init__(self, unet, noise_scheduler):
         super().__init__()
         noise_scheduler = noise_scheduler.set_format("pt")
diff --git a/src/diffusers/pipelines/pipeline_ddpm.py b/src/diffusers/pipelines/pipeline_ddpm.py
index ebcce77337..9cf83bfb75 100644
--- a/src/diffusers/pipelines/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/pipeline_ddpm.py
@@ -21,7 +21,7 @@ import tqdm
 from ..pipeline_utils import DiffusionPipeline
 
 
-class DDPM(DiffusionPipeline):
+class DDPMPipeline(DiffusionPipeline):
     def __init__(self, unet, noise_scheduler):
         super().__init__()
         noise_scheduler = noise_scheduler.set_format("pt")
diff --git a/src/diffusers/pipelines/pipeline_glide.py b/src/diffusers/pipelines/pipeline_glide.py
index 0046055349..8680b7542a 100644
--- a/src/diffusers/pipelines/pipeline_glide.py
+++ b/src/diffusers/pipelines/pipeline_glide.py
@@ -711,7 +711,7 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape):
     return res + torch.zeros(broadcast_shape, device=timesteps.device)
 
 
-class Glide(DiffusionPipeline):
+class GlidePipeline(DiffusionPipeline):
     def __init__(
         self,
         text_unet: GlideTextToImageUNetModel,
diff --git a/src/diffusers/pipelines/pipeline_grad_tts.py b/src/diffusers/pipelines/pipeline_grad_tts.py
index 4201124923..51c861a262 100644
--- a/src/diffusers/pipelines/pipeline_grad_tts.py
+++ b/src/diffusers/pipelines/pipeline_grad_tts.py
@@ -420,7 +420,7 @@ class TextEncoder(ModelMixin, ConfigMixin):
         return mu, logw, x_mask
 
 
-class GradTTS(DiffusionPipeline):
+class GradTTSPipeline(DiffusionPipeline):
     def __init__(self, unet, text_encoder, noise_scheduler, tokenizer):
         super().__init__()
         noise_scheduler = noise_scheduler.set_format("pt")
@@ -430,7 +430,14 @@ class GradTTS(DiffusionPipeline):
 
     @torch.no_grad()
     def __call__(
-        self, text, num_inference_steps=50, temperature=1.3, length_scale=0.91, speaker_id=15, torch_device=None
+        self,
+        text,
+        num_inference_steps=50,
+        temperature=1.3,
+        length_scale=0.91,
+        speaker_id=15,
+        torch_device=None,
+        generator=None,
     ):
         if torch_device is None:
             torch_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -464,7 +471,7 @@ class GradTTS(DiffusionPipeline):
         mu_y = mu_y.transpose(1, 2)
 
         # Sample latent representation from terminal distribution N(mu_y, I)
-        z = mu_y + torch.randn_like(mu_y, device=mu_y.device) / temperature
+        z = mu_y + torch.randn(mu_y.shape, device=mu_y.device, generator=generator) / temperature
 
         xt = z * y_mask
         h = 1.0 / num_inference_steps
diff --git a/src/diffusers/pipelines/pipeline_latent_diffusion.py b/src/diffusers/pipelines/pipeline_latent_diffusion.py
index cd7f653bf4..7d386765d4 100644
--- a/src/diffusers/pipelines/pipeline_latent_diffusion.py
+++ b/src/diffusers/pipelines/pipeline_latent_diffusion.py
@@ -860,7 +860,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin):
         return dec, posterior
 
 
-class LatentDiffusion(DiffusionPipeline):
+class LatentDiffusionPipeline(DiffusionPipeline):
     def __init__(self, vqvae, bert, tokenizer, unet, noise_scheduler):
         super().__init__()
         noise_scheduler = noise_scheduler.set_format("pt")
diff --git a/src/diffusers/pipelines/pipeline_pndm.py b/src/diffusers/pipelines/pipeline_pndm.py
index a19f933ed1..5fd8a98483 100644
--- a/src/diffusers/pipelines/pipeline_pndm.py
+++ b/src/diffusers/pipelines/pipeline_pndm.py
@@ -21,7 +21,7 @@ import tqdm
 from ..pipeline_utils import DiffusionPipeline
 
 
-class PNDM(DiffusionPipeline):
+class PNDMPipeline(DiffusionPipeline):
     def __init__(self, unet, noise_scheduler):
         super().__init__()
         noise_scheduler = noise_scheduler.set_format("pt")
diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py
index 372435de9d..720e68741f 100755
--- a/tests/test_modeling_utils.py
+++ b/tests/test_modeling_utils.py
@@ -22,17 +22,17 @@ import numpy as np
 import torch
 
 from diffusers import (
-    BDDM,
-    DDIM,
-    DDPM,
-    Glide,
-    PNDM,
+    BDDMPipeline,
+    DDIMPipeline,
     DDIMScheduler,
+    DDPMPipeline,
     DDPMScheduler,
+    GlidePipeline,
     GlideSuperResUNetModel,
     GlideTextToImageUNetModel,
-    GradTTS,
-    LatentDiffusion,
+    GradTTSPipeline,
+    LatentDiffusionPipeline,
+    PNDMPipeline,
     PNDMScheduler,
     UNetGradTTSModel,
     UNetLDMModel,
@@ -583,11 +583,11 @@ class PipelineTesterMixin(unittest.TestCase):
         model = UNetModel(ch=32, ch_mult=(1, 2), num_res_blocks=2, attn_resolutions=(16,), resolution=32)
         schedular = DDPMScheduler(timesteps=10)
 
-        ddpm = DDPM(model, schedular)
+        ddpm = DDPMPipeline(model, schedular)
 
         with tempfile.TemporaryDirectory() as tmpdirname:
             ddpm.save_pretrained(tmpdirname)
-            new_ddpm = DDPM.from_pretrained(tmpdirname)
+            new_ddpm = DDPMPipeline.from_pretrained(tmpdirname)
 
         generator = torch.manual_seed(0)
 
@@ -601,7 +601,7 @@ class PipelineTesterMixin(unittest.TestCase):
     def test_from_pretrained_hub(self):
         model_path = "fusing/ddpm-cifar10"
 
-        ddpm = DDPM.from_pretrained(model_path)
+        ddpm = DDPMPipeline.from_pretrained(model_path)
         ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path)
 
         ddpm.noise_scheduler.num_timesteps = 10
@@ -624,7 +624,7 @@ class PipelineTesterMixin(unittest.TestCase):
         noise_scheduler = DDPMScheduler.from_config(model_id)
         noise_scheduler = noise_scheduler.set_format("pt")
 
-        ddpm = DDPM(unet=unet, noise_scheduler=noise_scheduler)
+        ddpm = DDPMPipeline(unet=unet, noise_scheduler=noise_scheduler)
         image = ddpm(generator=generator)
 
         image_slice = image[0, -1, -3:, -3:].cpu()
@@ -641,7 +641,7 @@ class PipelineTesterMixin(unittest.TestCase):
         unet = UNetModel.from_pretrained(model_id)
         noise_scheduler = DDIMScheduler(tensor_format="pt")
 
-        ddim = DDIM(unet=unet, noise_scheduler=noise_scheduler)
+        ddim = DDIMPipeline(unet=unet, noise_scheduler=noise_scheduler)
         image = ddim(generator=generator, eta=0.0)
 
         image_slice = image[0, -1, -3:, -3:].cpu()
@@ -660,7 +660,7 @@ class PipelineTesterMixin(unittest.TestCase):
         unet = UNetModel.from_pretrained(model_id)
         noise_scheduler = PNDMScheduler(tensor_format="pt")
 
-        pndm = PNDM(unet=unet, noise_scheduler=noise_scheduler)
+        pndm = PNDMPipeline(unet=unet, noise_scheduler=noise_scheduler)
         image = pndm(generator=generator)
 
         image_slice = image[0, -1, -3:, -3:].cpu()
@@ -674,7 +674,7 @@ class PipelineTesterMixin(unittest.TestCase):
     @slow
     def test_ldm_text2img(self):
         model_id = "fusing/latent-diffusion-text2im-large"
-        ldm = LatentDiffusion.from_pretrained(model_id)
+        ldm = LatentDiffusionPipeline.from_pretrained(model_id)
 
         prompt = "A painting of a squirrel eating a burger"
         generator = torch.manual_seed(0)
@@ -689,7 +689,7 @@ class PipelineTesterMixin(unittest.TestCase):
     @slow
     def test_glide_text2img(self):
         model_id = "fusing/glide-base"
-        glide = Glide.from_pretrained(model_id)
+        glide = GlidePipeline.from_pretrained(model_id)
 
         prompt = "a pencil sketch of a corgi"
         generator = torch.manual_seed(0)
@@ -704,22 +704,25 @@ class PipelineTesterMixin(unittest.TestCase):
     @slow
     def test_grad_tts(self):
         model_id = "fusing/grad-tts-libri-tts"
-        grad_tts = GradTTS.from_pretrained(model_id)
+        grad_tts = GradTTSPipeline.from_pretrained(model_id)
 
         text = "Hello world, I missed you so much."
+        generator = torch.manual_seed(0)
 
         # generate mel spectograms using text
-        mel_spec = grad_tts(text)
+        mel_spec = grad_tts(text, generator=generator)
 
-        assert mel_spec.shape == (1, 256, 256, 3)
-        expected_slice = torch.tensor([0.7119, 0.7073, 0.6460, 0.7780, 0.7423, 0.6926, 0.7378, 0.7189, 0.7784])
-        assert (mel_spec.flatten() - expected_slice).abs().max() < 1e-2
+        assert mel_spec.shape == (1, 80, 143)
+        expected_slice = torch.tensor(
+            [-6.6119, -6.5963, -6.2776, -6.7496, -6.7096, -6.5131, -6.4643, -6.4817, -6.7185]
+        )
+        assert (mel_spec[0, :3, :3].flatten() - expected_slice).abs().max() < 1e-2
 
     def test_module_from_pipeline(self):
         model = DiffWave(num_res_layers=4)
         noise_scheduler = DDPMScheduler(timesteps=12)
 
-        bddm = BDDM(model, noise_scheduler)
+        bddm = BDDMPipeline(model, noise_scheduler)
 
         # check if the library name for the diffwave moduel is set to pipeline module
         self.assertTrue(bddm.config["diffwave"][0] == "pipeline_bddm")
@@ -727,6 +730,6 @@ class PipelineTesterMixin(unittest.TestCase):
         # check if we can save and load the pipeline
         with tempfile.TemporaryDirectory() as tmpdirname:
             bddm.save_pretrained(tmpdirname)
-            _ = BDDM.from_pretrained(tmpdirname)
+            _ = BDDMPipeline.from_pretrained(tmpdirname)
             # check if the same works using the DifusionPipeline class
             _ = DiffusionPipeline.from_pretrained(tmpdirname)

From 40e28e8bf4165c1167148fb825affd57c53b00ea Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 22 Jun 2022 13:42:09 +0000
Subject: [PATCH 08/10] only remove module if necessary

---
 src/diffusers/pipeline_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index 339ebb074a..d73b8d8fb3 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -86,7 +86,7 @@ class DiffusionPipeline(ConfigMixin):
         model_index_dict = dict(self.config)
         model_index_dict.pop("_class_name")
         model_index_dict.pop("_diffusers_version")
-        model_index_dict.pop("_module")
+        model_index_dict.pop("_module", None)
 
         for pipeline_component_name in model_index_dict.keys():
             sub_model = getattr(self, pipeline_component_name)

From 3a17775454b80d2b0bceb0de7ac6b444ff288c75 Mon Sep 17 00:00:00 2001
From: Anton Lozhkov <anton@huggingface.co>
Date: Wed, 22 Jun 2022 17:26:07 +0200
Subject: [PATCH 09/10] TODO: Add FID and KID metrics

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 32dc7c8229..7cb20b0e0e 100644
--- a/README.md
+++ b/README.md
@@ -288,3 +288,4 @@ wavwrite("generated_audio.wav", sampling_rate, audio.squeeze().cpu().numpy())
 - [ ] Add more vision models
 - [ ] Add more speech models
 - [ ] Add RL model
+- [ ] Add FID and KID metrics

From 6e456b7a7afa72543cad6503c91d31c6cb793a3a Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Wed, 22 Jun 2022 18:38:32 +0200
Subject: [PATCH 10/10] Update README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 7cb20b0e0e..6c2c9799c2 100644
--- a/README.md
+++ b/README.md
@@ -253,12 +253,12 @@ image_pil.save("test.png")
 
 ```python
 import torch
-from diffusers import BDDMPipeline, GradTTS
+from diffusers import BDDMPipeline, GradTTSPipeline
 
 torch_device = "cuda"
 
 # load grad tts and bddm pipelines
-grad_tts = GradTTS.from_pretrained("fusing/grad-tts-libri-tts")
+grad_tts = GradTTSPipeline.from_pretrained("fusing/grad-tts-libri-tts")
 bddm = BDDMPipeline.from_pretrained("fusing/diffwave-vocoder-ljspeech")
 
 text = "Hello world, I missed you so much."