diff --git a/Makefile b/Makefile index 83a84fe461..ec8237e15f 100644 --- a/Makefile +++ b/Makefile @@ -34,13 +34,9 @@ autogenerate_code: deps_table_update # Check that the repo is in a good state repo-consistency: - python utils/check_copies.py - python utils/check_table.py python utils/check_dummies.py python utils/check_repo.py python utils/check_inits.py - python utils/check_config_docstrings.py - python utils/tests_fetcher.py --sanity_check # this target runs checks on all files @@ -48,14 +44,13 @@ quality: black --check --preview $(check_dirs) isort --check-only $(check_dirs) flake8 $(check_dirs) - doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source + doc-builder style src/diffusers docs/source --max_len 119 --check_only --path_to_docs docs/source # Format source code automatically and check is there are any problems left that need manual fixing extra_style_checks: python utils/custom_init_isort.py - python utils/sort_auto_mappings.py - doc-builder style src/transformers docs/source --max_len 119 --path_to_docs docs/source + doc-builder style src/diffusers docs/source --max_len 119 --path_to_docs docs/source # this target runs checks on all files and potentially modifies some of them @@ -73,8 +68,6 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency fix-copies: python utils/check_dummies.py --fix_and_overwrite - python utils/check_table.py --fix_and_overwrite - python utils/check_copies.py --fix_and_overwrite # Run tests for the library diff --git a/src/diffusers/hub_utils.py b/src/diffusers/hub_utils.py index c2d1e34f3e..2ab2ff289a 100644 --- a/src/diffusers/hub_utils.py +++ b/src/diffusers/hub_utils.py @@ -47,12 +47,11 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: def init_git_repo(args, at_init: bool = False): """ - Initializes a git repo in `args.hub_model_id`. Args: + Initializes a git repo in `args.hub_model_id`. at_init (`bool`, *optional*, defaults to `False`): - Whether this function is called before any training or not. If `self.args.overwrite_output_dir` is - `True` and `at_init` is `True`, the path to the repo (which is `self.args.output_dir`) might be wiped - out. + Whether this function is called before any training or not. If `self.args.overwrite_output_dir` is `True` + and `at_init` is `True`, the path to the repo (which is `self.args.output_dir`) might be wiped out. """ if args.local_rank not in [-1, 0]: return @@ -102,8 +101,8 @@ def push_to_hub( **kwargs, ) -> str: """ - Upload *self.model* and *self.tokenizer* to the 🤗 model hub on the repo *self.args.hub_model_id*. Parameters: + Upload *self.model* and *self.tokenizer* to the 🤗 model hub on the repo *self.args.hub_model_id*. commit_message (`str`, *optional*, defaults to `"End of training"`): Message to commit while pushing. blocking (`bool`, *optional*, defaults to `True`): @@ -111,8 +110,8 @@ def push_to_hub( kwargs: Additional keyword arguments passed along to [`create_model_card`]. 
Returns: - The url of the commit of your model in the given repository if `blocking=False`, a tuple with the url of - the commit and an object to track the progress of the commit if `blocking=True` + The url of the commit of your model in the given repository if `blocking=False`, a tuple with the url of the + commit and an object to track the progress of the commit if `blocking=True` """ if args.hub_model_id is None: diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py index 0b3d072b70..aa60ffa936 100644 --- a/src/diffusers/modeling_utils.py +++ b/src/diffusers/modeling_utils.py @@ -123,16 +123,16 @@ class ModelMixin(torch.nn.Module): r""" Base class for all models. - [`ModelMixin`] takes care of storing the configuration of the models and handles methods for loading, - downloading and saving models as well as a few methods common to all models to: + [`ModelMixin`] takes care of storing the configuration of the models and handles methods for loading, downloading + and saving models as well as a few methods common to all models to: - resize the input embeddings, - prune heads in the self-attention heads. Class attributes (overridden by derived classes): - - **config_class** ([`ConfigMixin`]) -- A subclass of [`ConfigMixin`] to use as configuration class - for this model architecture. + - **config_class** ([`ConfigMixin`]) -- A subclass of [`ConfigMixin`] to use as configuration class for this + model architecture. - **load_tf_weights** (`Callable`) -- A python *method* for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: @@ -227,8 +227,8 @@ class ModelMixin(torch.nn.Module): - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. - - A path to a *directory* containing model weights saved using - [`~ModelMixin.save_pretrained`], e.g., `./my_model_directory/`. + - A path to a *directory* containing model weights saved using [`~ModelMixin.save_pretrained`], + e.g., `./my_model_directory/`. config (`Union[ConfigMixin, str, os.PathLike]`, *optional*): Can be either: @@ -236,13 +236,13 @@ class ModelMixin(torch.nn.Module): - an instance of a class derived from [`ConfigMixin`], - a string or path valid as input to [`~ConfigMixin.from_pretrained`]. - ConfigMixinuration for the model to use instead of an automatically loaded configuration. ConfigMixinuration can - be automatically loaded when: + ConfigMixinuration for the model to use instead of an automatically loaded configuration. + ConfigMixinuration can be automatically loaded when: - The model is a model provided by the library (loaded with the *model id* string of a pretrained model). - - The model was saved using [`~ModelMixin.save_pretrained`] and is reloaded by supplying the - save directory. + - The model was saved using [`~ModelMixin.save_pretrained`] and is reloaded by supplying the save + directory. - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a configuration JSON file named *config.json* is found in the directory. 
cache_dir (`Union[str, os.PathLike]`, *optional*): @@ -292,10 +292,10 @@ class ModelMixin(torch.nn.Module): underlying model's `__init__` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, `kwargs` will be first passed to the configuration class - initialization function ([`~ConfigMixin.from_pretrained`]). Each key of `kwargs` that - corresponds to a configuration attribute will be used to override said attribute with the - supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute - will be passed to the underlying model's `__init__` function. + initialization function ([`~ConfigMixin.from_pretrained`]). Each key of `kwargs` that corresponds + to a configuration attribute will be used to override said attribute with the supplied `kwargs` + value. Remaining keys that do not correspond to any configuration attribute will be passed to the + underlying model's `__init__` function. diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index f31b64ee5c..e70f39319e 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -22,14 +22,12 @@ def get_timestep_embedding( timesteps, embedding_dim, flip_sin_to_cos=False, downscale_freq_shift=1, scale=1, max_period=10000 ): """ - This matches the implementation in Denoising Diffusion Probabilistic Models: - Create sinusoidal timestep embeddings. + This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. :param timesteps: a 1-D Tensor of N indices, one per batch element. These may be fractional. - :param embedding_dim: the dimension of the output. - :param max_period: controls the minimum frequency of the embeddings. - :return: an [N x dim] Tensor of positional embeddings. + :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the + embeddings. :return: an [N x dim] Tensor of positional embeddings. """ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array" diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 9e5ef17641..6560d34559 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -58,9 +58,8 @@ class Upsample(nn.Module): """ An upsampling layer with an optional convolution. - :param channels: channels in the inputs and outputs. - :param use_conv: a bool determining if a convolution is applied. - :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is + applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then upsampling occurs in the inner-two dimensions. """ @@ -97,9 +96,8 @@ class Downsample(nn.Module): """ A downsampling layer with an optional convolution. - :param channels: channels in the inputs and outputs. - :param use_conv: a bool determining if a convolution is applied. - :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is + applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then downsampling occurs in the inner-two dimensions. """ @@ -143,9 +141,8 @@ class GlideUpsample(nn.Module): """ An upsampling layer with an optional convolution. - :param channels: channels in the inputs and outputs. 
- :param use_conv: a bool determining if a convolution is applied. - :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is + applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then upsampling occurs in the inner-two dimensions. """ @@ -171,10 +168,9 @@ class GlideUpsample(nn.Module): class LDMUpsample(nn.Module): """ - An upsampling layer with an optional convolution. - :param channels: channels in the inputs and outputs. - :param use_conv: a bool determining if a convolution is applied. - :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + An upsampling layer with an optional convolution. :param channels: channels in the inputs and outputs. :param + use_conv: a bool determining if a convolution is applied. :param dims: determines if the signal is 1D, 2D, or 3D. + If 3D, then upsampling occurs in the inner-two dimensions. """ diff --git a/src/diffusers/models/unet_glide.py b/src/diffusers/models/unet_glide.py index 9a50b9cb52..d357d0cc8a 100644 --- a/src/diffusers/models/unet_glide.py +++ b/src/diffusers/models/unet_glide.py @@ -82,8 +82,7 @@ def normalization(channels, swish=0.0): """ Make a standard normalization layer, with an optional swish activation. - :param channels: number of input channels. - :return: an nn.Module for normalization. + :param channels: number of input channels. :return: an nn.Module for normalization. """ return GroupNorm32(num_channels=channels, num_groups=32, swish=swish) @@ -111,8 +110,7 @@ class TimestepBlock(nn.Module): class TimestepEmbedSequential(nn.Sequential, TimestepBlock): """ - A sequential module that passes timestep embeddings to the children that - support it as an extra input. + A sequential module that passes timestep embeddings to the children that support it as an extra input. """ def forward(self, x, emb, encoder_out=None): @@ -130,9 +128,8 @@ class Downsample(nn.Module): """ A downsampling layer with an optional convolution. - :param channels: channels in the inputs and outputs. - :param use_conv: a bool determining if a convolution is applied. - :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is + applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then downsampling occurs in the inner-two dimensions. """ @@ -158,17 +155,13 @@ class ResBlock(TimestepBlock): """ A residual block that can optionally change the number of channels. - :param channels: the number of input channels. - :param emb_channels: the number of timestep embedding channels. - :param dropout: the rate of dropout. - :param out_channels: if specified, the number of out channels. - :param use_conv: if True and out_channels is specified, use a spatial - convolution instead of a smaller 1x1 convolution to change the - channels in the skip connection. - :param dims: determines if the signal is 1D, 2D, or 3D. - :param use_checkpoint: if True, use gradient checkpointing on this module. - :param up: if True, use this block for upsampling. - :param down: if True, use this block for downsampling. + :param channels: the number of input channels. :param emb_channels: the number of timestep embedding channels. + :param dropout: the rate of dropout. :param out_channels: if specified, the number of out channels. 
:param + use_conv: if True and out_channels is specified, use a spatial + convolution instead of a smaller 1x1 convolution to change the channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. :param use_checkpoint: if True, use gradient checkpointing + on this module. :param up: if True, use this block for upsampling. :param down: if True, use this block for + downsampling. """ def __init__( @@ -235,8 +228,7 @@ class ResBlock(TimestepBlock): """ Apply the block to a Tensor, conditioned on a timestep embedding. - :param x: an [N x C x ...] Tensor of features. - :param emb: an [N x emb_channels] Tensor of timestep embeddings. + :param x: an [N x C x ...] Tensor of features. :param emb: an [N x emb_channels] Tensor of timestep embeddings. :return: an [N x C x ...] Tensor of outputs. """ if self.updown: @@ -320,8 +312,8 @@ class QKVAttention(nn.Module): """ Apply QKV attention. - :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. - :return: an [N x (H * C) x T] tensor after attention. + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. :return: an [N x (H * C) x T] tensor after + attention. """ bs, width, length = qkv.shape assert width % (3 * self.n_heads) == 0 @@ -343,29 +335,24 @@ class GlideUNetModel(ModelMixin, ConfigMixin): """ The full UNet model with attention and timestep embedding. - :param in_channels: channels in the input Tensor. - :param model_channels: base channel count for the model. - :param out_channels: channels in the output Tensor. - :param num_res_blocks: number of residual blocks per downsample. + :param in_channels: channels in the input Tensor. :param model_channels: base channel count for the model. :param + out_channels: channels in the output Tensor. :param num_res_blocks: number of residual blocks per downsample. :param attention_resolutions: a collection of downsample rates at which - attention will take place. May be a set, list, or tuple. - For example, if this contains 4, then at 4x downsampling, attention - will be used. - :param dropout: the dropout probability. - :param channel_mult: channel multiplier for each level of the UNet. - :param conv_resample: if True, use learned convolutions for upsampling and + attention will take place. May be a set, list, or tuple. For example, if this contains 4, then at 4x + downsampling, attention will be used. + :param dropout: the dropout probability. :param channel_mult: channel multiplier for each level of the UNet. :param + conv_resample: if True, use learned convolutions for upsampling and downsampling. - :param dims: determines if the signal is 1D, 2D, or 3D. - :param num_classes: if specified (as an int), then this model will be + :param dims: determines if the signal is 1D, 2D, or 3D. :param num_classes: if specified (as an int), then this + model will be class-conditional with `num_classes` classes. - :param use_checkpoint: use gradient checkpointing to reduce memory usage. - :param num_heads: the number of attention heads in each attention layer. - :param num_heads_channels: if specified, ignore num_heads and instead use + :param use_checkpoint: use gradient checkpointing to reduce memory usage. :param num_heads: the number of attention + heads in each attention layer. :param num_heads_channels: if specified, ignore num_heads and instead use a fixed channel width per attention head. :param num_heads_upsample: works with num_heads to set a different number of heads for upsampling. Deprecated. 
- :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. - :param resblock_updown: use residual blocks for up/downsampling. + :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. :param resblock_updown: use residual blocks + for up/downsampling. """ def __init__( @@ -571,10 +558,8 @@ class GlideUNetModel(ModelMixin, ConfigMixin): """ Apply the model to an input batch. - :param x: an [N x C x ...] Tensor of inputs. - :param timesteps: a 1-D batch of timesteps. - :param y: an [N] Tensor of labels, if class-conditional. - :return: an [N x C x ...] Tensor of outputs. + :param x: an [N x C x ...] Tensor of inputs. :param timesteps: a 1-D batch of timesteps. :param y: an [N] + Tensor of labels, if class-conditional. :return: an [N x C x ...] Tensor of outputs. """ hs = [] diff --git a/src/diffusers/models/unet_ldm.py b/src/diffusers/models/unet_ldm.py index 378fdd57a2..f8a8602d2f 100644 --- a/src/diffusers/models/unet_ldm.py +++ b/src/diffusers/models/unet_ldm.py @@ -222,11 +222,8 @@ class BasicTransformerBlock(nn.Module): class SpatialTransformer(nn.Module): """ - Transformer block for image-like data. - First, project the input (aka embedding) - and reshape to b, t, d. - Then apply standard transformer action. - Finally, reshape to image + Transformer block for image-like data. First, project the input (aka embedding) and reshape to b, t, d. Then apply + standard transformer action. Finally, reshape to image """ def __init__(self, in_channels, n_heads, d_head, depth=1, dropout=0.0, context_dim=None): @@ -331,8 +328,7 @@ def normalization(channels, swish=0.0): """ Make a standard normalization layer, with an optional swish activation. - :param channels: number of input channels. - :return: an nn.Module for normalization. + :param channels: number of input channels. :return: an nn.Module for normalization. """ return GroupNorm32(num_channels=channels, num_groups=32, swish=swish) @@ -382,8 +378,7 @@ class TimestepBlock(nn.Module): class TimestepEmbedSequential(nn.Sequential, TimestepBlock): """ - A sequential module that passes timestep embeddings to the children that - support it as an extra input. + A sequential module that passes timestep embeddings to the children that support it as an extra input. """ def forward(self, x, emb, context=None): @@ -399,10 +394,9 @@ class TimestepEmbedSequential(nn.Sequential, TimestepBlock): class Downsample(nn.Module): """ - A downsampling layer with an optional convolution. - :param channels: channels in the inputs and outputs. - :param use_conv: a bool determining if a convolution is applied. - :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + A downsampling layer with an optional convolution. :param channels: channels in the inputs and outputs. :param + use_conv: a bool determining if a convolution is applied. :param dims: determines if the signal is 1D, 2D, or 3D. + If 3D, then downsampling occurs in the inner-two dimensions. """ @@ -426,18 +420,14 @@ class Downsample(nn.Module): class ResBlock(TimestepBlock): """ - A residual block that can optionally change the number of channels. - :param channels: the number of input channels. - :param emb_channels: the number of timestep embedding channels. - :param dropout: the rate of dropout. - :param out_channels: if specified, the number of out channels. - :param use_conv: if True and out_channels is specified, use a spatial - convolution instead of a smaller 1x1 convolution to change the - channels in the skip connection. 
- :param dims: determines if the signal is 1D, 2D, or 3D. - :param use_checkpoint: if True, use gradient checkpointing on this module. - :param up: if True, use this block for upsampling. - :param down: if True, use this block for downsampling. + A residual block that can optionally change the number of channels. :param channels: the number of input channels. + :param emb_channels: the number of timestep embedding channels. :param dropout: the rate of dropout. :param + out_channels: if specified, the number of out channels. :param use_conv: if True and out_channels is specified, use + a spatial + convolution instead of a smaller 1x1 convolution to change the channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. :param use_checkpoint: if True, use gradient checkpointing + on this module. :param up: if True, use this block for upsampling. :param down: if True, use this block for + downsampling. """ def __init__( @@ -525,8 +515,8 @@ class ResBlock(TimestepBlock): class AttentionBlock(nn.Module): """ - An attention block that allows spatial positions to attend to each other. - Originally ported from here, but adapted to the N-d case. + An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted + to the N-d case. https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. """ @@ -575,9 +565,8 @@ class QKVAttention(nn.Module): def forward(self, qkv): """ - Apply QKV attention. - :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. - :return: an [N x (H * C) x T] tensor after attention. + Apply QKV attention. :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. :return: an [N x (H * C) x + T] tensor after attention. """ bs, width, length = qkv.shape assert width % (3 * self.n_heads) == 0 @@ -600,13 +589,9 @@ class QKVAttention(nn.Module): def count_flops_attn(model, _x, y): """ - A counter for the `thop` package to count the operations in an - attention operation. - Meant to be used like: + A counter for the `thop` package to count the operations in an attention operation. Meant to be used like: macs, params = thop.profile( - model, - inputs=(inputs, timestamps), - custom_ops={QKVAttention: QKVAttention.count_flops}, + model, inputs=(inputs, timestamps), custom_ops={QKVAttention: QKVAttention.count_flops}, ) """ b, c, *spatial = y[0].shape @@ -629,9 +614,8 @@ class QKVAttentionLegacy(nn.Module): def forward(self, qkv): """ - Apply QKV attention. - :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. - :return: an [N x (H * C) x T] tensor after attention. + Apply QKV attention. :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. :return: an [N x (H * C) x + T] tensor after attention. """ bs, width, length = qkv.shape assert width % (3 * self.n_heads) == 0 @@ -650,31 +634,25 @@ class QKVAttentionLegacy(nn.Module): class UNetLDMModel(ModelMixin, ConfigMixin): """ - The full UNet model with attention and timestep embedding. - :param in_channels: channels in the input Tensor. - :param model_channels: base channel count for the model. - :param out_channels: channels in the output Tensor. - :param num_res_blocks: number of residual blocks per downsample. - :param attention_resolutions: a collection of downsample rates at which - attention will take place. May be a set, list, or tuple. - For example, if this contains 4, then at 4x downsampling, attention - will be used. - :param dropout: the dropout probability. 
- :param channel_mult: channel multiplier for each level of the UNet. - :param conv_resample: if True, use learned convolutions for upsampling and + The full UNet model with attention and timestep embedding. :param in_channels: channels in the input Tensor. :param + model_channels: base channel count for the model. :param out_channels: channels in the output Tensor. :param + num_res_blocks: number of residual blocks per downsample. :param attention_resolutions: a collection of downsample + rates at which + attention will take place. May be a set, list, or tuple. For example, if this contains 4, then at 4x + downsampling, attention will be used. + :param dropout: the dropout probability. :param channel_mult: channel multiplier for each level of the UNet. :param + conv_resample: if True, use learned convolutions for upsampling and downsampling. - :param dims: determines if the signal is 1D, 2D, or 3D. - :param num_classes: if specified (as an int), then this model will be + :param dims: determines if the signal is 1D, 2D, or 3D. :param num_classes: if specified (as an int), then this + model will be class-conditional with `num_classes` classes. - :param use_checkpoint: use gradient checkpointing to reduce memory usage. - :param num_heads: the number of attention heads in each attention layer. - :param num_heads_channels: if specified, ignore num_heads and instead use + :param use_checkpoint: use gradient checkpointing to reduce memory usage. :param num_heads: the number of attention + heads in each attention layer. :param num_heads_channels: if specified, ignore num_heads and instead use a fixed channel width per attention head. :param num_heads_upsample: works with num_heads to set a different number of heads for upsampling. Deprecated. - :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. - :param resblock_updown: use residual blocks for up/downsampling. - :param use_new_attention_order: use a different attention pattern for potentially + :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. :param resblock_updown: use residual blocks + for up/downsampling. :param use_new_attention_order: use a different attention pattern for potentially increased efficiency. """ @@ -975,12 +953,9 @@ class UNetLDMModel(ModelMixin, ConfigMixin): def forward(self, x, timesteps=None, context=None, y=None, **kwargs): """ - Apply the model to an input batch. - :param x: an [N x C x ...] Tensor of inputs. - :param timesteps: a 1-D batch of timesteps. - :param context: conditioning plugged in via crossattn - :param y: an [N] Tensor of labels, if class-conditional. - :return: an [N x C x ...] Tensor of outputs. + Apply the model to an input batch. :param x: an [N x C x ...] Tensor of inputs. :param timesteps: a 1-D batch + of timesteps. :param context: conditioning plugged in via crossattn :param y: an [N] Tensor of labels, if + class-conditional. :return: an [N x C x ...] Tensor of outputs. """ assert (y is not None) == ( self.num_classes is not None @@ -1012,8 +987,7 @@ class UNetLDMModel(ModelMixin, ConfigMixin): class EncoderUNetModel(nn.Module): """ - The half UNet model with attention and timestep embedding. - For usage, see UNet. + The half UNet model with attention and timestep embedding. For usage, see UNet. """ def __init__( @@ -1197,10 +1171,8 @@ class EncoderUNetModel(nn.Module): def forward(self, x, timesteps): """ - Apply the model to an input batch. - :param x: an [N x C x ...] Tensor of inputs. - :param timesteps: a 1-D batch of timesteps. 
- :return: an [N x K] Tensor of outputs. + Apply the model to an input batch. :param x: an [N x C x ...] Tensor of inputs. :param timesteps: a 1-D batch + of timesteps. :return: an [N x K] Tensor of outputs. """ emb = self.time_embed( get_timestep_embedding(timesteps, self.model_channels, flip_sin_to_cos=True, downscale_freq_shift=0) diff --git a/src/diffusers/models/unet_rl.py b/src/diffusers/models/unet_rl.py index a0b8c5e47a..9c0c77130c 100644 --- a/src/diffusers/models/unet_rl.py +++ b/src/diffusers/models/unet_rl.py @@ -111,10 +111,8 @@ class ResidualTemporalBlock(nn.Module): def forward(self, x, t): """ - x : [ batch_size x inp_channels x horizon ] - t : [ batch_size x embed_dim ] - returns: - out : [ batch_size x out_channels x horizon ] + x : [ batch_size x inp_channels x horizon ] t : [ batch_size x embed_dim ] returns: out : [ batch_size x + out_channels x horizon ] """ out = self.blocks[0](x) + self.time_mlp(t) out = self.blocks[1](out) diff --git a/src/diffusers/models/unet_sde_score_estimation.py b/src/diffusers/models/unet_sde_score_estimation.py index 83700c4b63..44c635922d 100644 --- a/src/diffusers/models/unet_sde_score_estimation.py +++ b/src/diffusers/models/unet_sde_score_estimation.py @@ -136,26 +136,21 @@ def naive_downsample_2d(x, factor=2): def upsample_conv_2d(x, w, k=None, factor=2, gain=1): """Fused `upsample_2d()` followed by `tf.nn.conv2d()`. - Padding is performed only once at the beginning, not between the - operations. - The fused op is considerably more efficient than performing the same - calculation - using standard TensorFlow ops. It supports gradients of arbitrary order. Args: - x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, + Padding is performed only once at the beginning, not between the operations. The fused op is considerably more + efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of arbitrary + order. + x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - w: Weight tensor of the shape `[filterH, filterW, inChannels, - outChannels]`. Grouped convolution can be performed by `inChannels = - x.shape[0] // numGroups`. - k: FIR filter of the shape `[firH, firW]` or `[firN]` - (separable). The default is `[1] * factor`, which corresponds to - nearest-neighbor upsampling. - factor: Integer upsampling factor (default: 2). - gain: Scaling factor for signal magnitude (default: 1.0). + w: Weight tensor of the shape `[filterH, filterW, inChannels, + outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`. + k: FIR filter of the shape `[firH, firW]` or `[firN]` + (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling. + factor: Integer upsampling factor (default: 2). gain: Scaling factor for signal magnitude (default: 1.0). Returns: - Tensor of the shape `[N, C, H * factor, W * factor]` or - `[N, H * factor, W * factor, C]`, and same datatype as `x`. + Tensor of the shape `[N, C, H * factor, W * factor]` or `[N, H * factor, W * factor, C]`, and same datatype as + `x`. """ assert isinstance(factor, int) and factor >= 1 @@ -208,25 +203,21 @@ def upsample_conv_2d(x, w, k=None, factor=2, gain=1): def conv_downsample_2d(x, w, k=None, factor=2, gain=1): """Fused `tf.nn.conv2d()` followed by `downsample_2d()`. - Padding is performed only once at the beginning, not between the operations. - The fused op is considerably more efficient than performing the same - calculation - using standard TensorFlow ops. 
It supports gradients of arbitrary order. Args: - x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, + Padding is performed only once at the beginning, not between the operations. The fused op is considerably more + efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of arbitrary + order. + x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - w: Weight tensor of the shape `[filterH, filterW, inChannels, - outChannels]`. Grouped convolution can be performed by `inChannels = - x.shape[0] // numGroups`. - k: FIR filter of the shape `[firH, firW]` or `[firN]` - (separable). The default is `[1] * factor`, which corresponds to - average pooling. - factor: Integer downsampling factor (default: 2). - gain: Scaling factor for signal magnitude (default: 1.0). + w: Weight tensor of the shape `[filterH, filterW, inChannels, + outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`. + k: FIR filter of the shape `[firH, firW]` or `[firN]` + (separable). The default is `[1] * factor`, which corresponds to average pooling. + factor: Integer downsampling factor (default: 2). gain: Scaling factor for signal magnitude (default: 1.0). Returns: - Tensor of the shape `[N, C, H // factor, W // factor]` or - `[N, H // factor, W // factor, C]`, and same datatype as `x`. + Tensor of the shape `[N, C, H // factor, W // factor]` or `[N, H // factor, W // factor, C]`, and same datatype + as `x`. """ assert isinstance(factor, int) and factor >= 1 @@ -258,22 +249,16 @@ def _shape(x, dim): def upsample_2d(x, k=None, factor=2, gain=1): r"""Upsample a batch of 2D images with the given filter. - Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` - and upsamples each image with the given filter. The filter is normalized so - that - if the input pixels are constant, they will be scaled by the specified - `gain`. - Pixels outside the image are assumed to be zero, and the filter is padded - with - zeros so that its shape is a multiple of the upsampling factor. Args: - x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, + Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and upsamples each image with the given + filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the specified + `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its shape is a: + multiple of the upsampling factor. + x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - k: FIR filter of the shape `[firH, firW]` or `[firN]` - (separable). The default is `[1] * factor`, which corresponds to - nearest-neighbor upsampling. - factor: Integer upsampling factor (default: 2). - gain: Scaling factor for signal magnitude (default: 1.0). + k: FIR filter of the shape `[firH, firW]` or `[firN]` + (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling. + factor: Integer upsampling factor (default: 2). gain: Scaling factor for signal magnitude (default: 1.0). Returns: Tensor of the shape `[N, C, H * factor, W * factor]` @@ -289,22 +274,16 @@ def upsample_2d(x, k=None, factor=2, gain=1): def downsample_2d(x, k=None, factor=2, gain=1): r"""Downsample a batch of 2D images with the given filter. - Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` - and downsamples each image with the given filter. 
The filter is normalized - so that - if the input pixels are constant, they will be scaled by the specified - `gain`. - Pixels outside the image are assumed to be zero, and the filter is padded - with - zeros so that its shape is a multiple of the downsampling factor. Args: - x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, + Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and downsamples each image with the + given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the + specified `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its + shape is a multiple of the downsampling factor. + x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. - k: FIR filter of the shape `[firH, firW]` or `[firN]` - (separable). The default is `[1] * factor`, which corresponds to - average pooling. - factor: Integer downsampling factor (default: 2). - gain: Scaling factor for signal magnitude (default: 1.0). + k: FIR filter of the shape `[firH, firW]` or `[firN]` + (separable). The default is `[1] * factor`, which corresponds to average pooling. + factor: Integer downsampling factor (default: 2). gain: Scaling factor for signal magnitude (default: 1.0). Returns: Tensor of the shape `[N, C, H // factor, W // factor]` diff --git a/src/diffusers/pipelines/grad_tts_utils.py b/src/diffusers/pipelines/grad_tts_utils.py index 15995b85c8..f36f31c5a9 100644 --- a/src/diffusers/pipelines/grad_tts_utils.py +++ b/src/diffusers/pipelines/grad_tts_utils.py @@ -290,7 +290,7 @@ def normalize_numbers(text): return text -""" from https://github.com/keithito/tacotron """ +""" from https://github.com/keithito/tacotron""" _pad = "_" @@ -322,8 +322,8 @@ def get_arpabet(word, dictionary): def text_to_sequence(text, cleaner_names=[english_cleaners], dictionary=None): """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. - The text can optionally have ARPAbet sequences enclosed in curly braces embedded - in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." + The text can optionally have ARPAbet sequences enclosed in curly braces embedded in it. For example, "Turn left on + {HH AW1 S S T AH0 N} Street." Args: text: string to convert to a sequence diff --git a/src/diffusers/pipelines/pipeline_bddm.py b/src/diffusers/pipelines/pipeline_bddm.py index 8b24cb9ceb..09120fdab0 100644 --- a/src/diffusers/pipelines/pipeline_bddm.py +++ b/src/diffusers/pipelines/pipeline_bddm.py @@ -29,8 +29,7 @@ from ..pipeline_utils import DiffusionPipeline def calc_diffusion_step_embedding(diffusion_steps, diffusion_step_embed_dim_in): """ Embed a diffusion step $t$ into a higher dimensional space - E.g. the embedding vector in the 128-dimensional space is - [sin(t * 10^(0*4/63)), ... , sin(t * 10^(63*4/63)), + E.g. the embedding vector in the 128-dimensional space is [sin(t * 10^(0*4/63)), ... , sin(t * 10^(63*4/63)), cos(t * 10^(0*4/63)), ... 
, cos(t * 10^(63*4/63))] Parameters: @@ -53,8 +52,7 @@ def calc_diffusion_step_embedding(diffusion_steps, diffusion_step_embed_dim_in): """ -Below scripts were borrowed from -https://github.com/philsyn/DiffWave-Vocoder/blob/master/WaveNet.py +Below scripts were borrowed from https://github.com/philsyn/DiffWave-Vocoder/blob/master/WaveNet.py """ diff --git a/src/diffusers/pipelines/pipeline_glide.py b/src/diffusers/pipelines/pipeline_glide.py index 8680b7542a..9a67790b35 100644 --- a/src/diffusers/pipelines/pipeline_glide.py +++ b/src/diffusers/pipelines/pipeline_glide.py @@ -699,9 +699,8 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape): """ Extract values from a 1-D numpy array for a batch of indices. - :param arr: the 1-D numpy array. - :param timesteps: a tensor of indices into the array to extract. - :param broadcast_shape: a larger shape of K dimensions with the batch + :param arr: the 1-D numpy array. :param timesteps: a tensor of indices into the array to extract. :param + broadcast_shape: a larger shape of K dimensions with the batch dimension equal to the length of timesteps. :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. """ diff --git a/src/diffusers/pipelines/pipeline_grad_tts.py b/src/diffusers/pipelines/pipeline_grad_tts.py index 743104e658..93770fe21e 100644 --- a/src/diffusers/pipelines/pipeline_grad_tts.py +++ b/src/diffusers/pipelines/pipeline_grad_tts.py @@ -1,4 +1,4 @@ -""" from https://github.com/jaywalnut310/glow-tts """ +""" from https://github.com/jaywalnut310/glow-tts""" import math diff --git a/src/diffusers/pipelines/pipeline_latent_diffusion.py b/src/diffusers/pipelines/pipeline_latent_diffusion.py index ffc8ae670c..fea7a287ed 100644 --- a/src/diffusers/pipelines/pipeline_latent_diffusion.py +++ b/src/diffusers/pipelines/pipeline_latent_diffusion.py @@ -554,11 +554,9 @@ class LDMBertModel(LDMBertPreTrainedModel): def get_timestep_embedding(timesteps, embedding_dim): """ - This matches the implementation in Denoising Diffusion Probabilistic Models: - From Fairseq. - Build sinusoidal embeddings. - This matches the implementation in tensor2tensor, but differs slightly - from the description in Section 3.5 of "Attention Is All You Need". + This matches the implementation in Denoising Diffusion Probabilistic Models: From Fairseq. Build sinusoidal + embeddings. This matches the implementation in tensor2tensor, but differs slightly from the description in Section + 3.5 of "Attention Is All You Need". """ assert len(timesteps.shape) == 1 @@ -1055,8 +1053,8 @@ class Decoder(nn.Module): class VectorQuantizer(nn.Module): """ - Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly - avoids costly matrix multiplications and allows for post-hoc remapping of indices. + Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix + multiplications and allows for post-hoc remapping of indices. """ # NOTE: due to a bug the beta term was applied to the wrong term. 
for diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index d11af4ec25..f626cb1ca5 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -25,13 +25,12 @@ from .scheduling_utils import SchedulerMixin def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ - Create a beta schedule that discretizes the given alpha_t_bar function, - which defines the cumulative product of (1-beta) over time from t = [0,1]. + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. - :param num_diffusion_timesteps: the number of betas to produce. - :param alpha_bar: a lambda that takes an argument t from 0 to 1 and - produces the cumulative product of (1-beta) up to that - part of the diffusion process. + :param num_diffusion_timesteps: the number of betas to produce. :param alpha_bar: a lambda that takes an argument t + from 0 to 1 and + produces the cumulative product of (1-beta) up to that part of the diffusion process. :param max_beta: the maximum beta to use; use values lower than 1 to prevent singularities. """ diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index d908850dfe..d4230ff069 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -25,13 +25,12 @@ from .scheduling_utils import SchedulerMixin def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ - Create a beta schedule that discretizes the given alpha_t_bar function, - which defines the cumulative product of (1-beta) over time from t = [0,1]. + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. - :param num_diffusion_timesteps: the number of betas to produce. - :param alpha_bar: a lambda that takes an argument t from 0 to 1 and - produces the cumulative product of (1-beta) up to that - part of the diffusion process. + :param num_diffusion_timesteps: the number of betas to produce. :param alpha_bar: a lambda that takes an argument t + from 0 to 1 and + produces the cumulative product of (1-beta) up to that part of the diffusion process. :param max_beta: the maximum beta to use; use values lower than 1 to prevent singularities. """ diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py index e7479d5497..8533ad6cd7 100644 --- a/src/diffusers/schedulers/scheduling_pndm.py +++ b/src/diffusers/schedulers/scheduling_pndm.py @@ -24,13 +24,12 @@ from .scheduling_utils import SchedulerMixin def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): """ - Create a beta schedule that discretizes the given alpha_t_bar function, - which defines the cumulative product of (1-beta) over time from t = [0,1]. + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. - :param num_diffusion_timesteps: the number of betas to produce. - :param alpha_bar: a lambda that takes an argument t from 0 to 1 and - produces the cumulative product of (1-beta) up to that - part of the diffusion process. + :param num_diffusion_timesteps: the number of betas to produce. 
:param alpha_bar: a lambda that takes an argument t + from 0 to 1 and + produces the cumulative product of (1-beta) up to that part of the diffusion process. :param max_beta: the maximum beta to use; use values lower than 1 to prevent singularities. """ diff --git a/src/diffusers/training_utils.py b/src/diffusers/training_utils.py index 99fecaa07f..f81bf5cc03 100644 --- a/src/diffusers/training_utils.py +++ b/src/diffusers/training_utils.py @@ -20,11 +20,10 @@ class EMAModel: ): """ @crowsonkb's notes on EMA Warmup: - If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are - good values for models you plan to train for a million or more steps (reaches decay - factor 0.999 at 31.6K steps, 0.9999 at 1M steps), gamma=1, power=3/4 for models - you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999 at - 215.4k steps). + If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan + to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps), + gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999 + at 215.4k steps). Args: inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1. power (float): Exponential factor of EMA warmup. Default: 2/3. diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 470526a8b5..2c56ba4a8a 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -89,20 +89,20 @@ class RevisionNotFoundError(HTTPError): TRANSFORMERS_IMPORT_ERROR = """ -{0} requires the transformers library but it was not found in your environment. You can install it with pip: -`pip install transformers` +{0} requires the transformers library but it was not found in your environment. You can install it with pip: `pip +install transformers` """ UNIDECODE_IMPORT_ERROR = """ -{0} requires the unidecode library but it was not found in your environment. You can install it with pip: -`pip install Unidecode` +{0} requires the unidecode library but it was not found in your environment. You can install it with pip: `pip install +Unidecode` """ INFLECT_IMPORT_ERROR = """ -{0} requires the inflect library but it was not found in your environment. You can install it with pip: -`pip install inflect` +{0} requires the inflect library but it was not found in your environment. 
You can install it with pip: `pip install +inflect` """ diff --git a/src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py b/src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py index 320a93134a..8c2aec218c 100644 --- a/src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py +++ b/src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py @@ -3,7 +3,7 @@ from ..utils import DummyObject, requires_backends -class GradTTS(metaclass=DummyObject): +class GradTTSPipeline(metaclass=DummyObject): _backends = ["transformers", "inflect", "unidecode"] def __init__(self, *args, **kwargs): diff --git a/src/diffusers/utils/dummy_transformers_objects.py b/src/diffusers/utils/dummy_transformers_objects.py index 1efb17297f..ac34367a3b 100644 --- a/src/diffusers/utils/dummy_transformers_objects.py +++ b/src/diffusers/utils/dummy_transformers_objects.py @@ -31,14 +31,14 @@ class UNetGradTTSModel(metaclass=DummyObject): requires_backends(self, ["transformers"]) -class Glide(metaclass=DummyObject): +class GlidePipeline(metaclass=DummyObject): _backends = ["transformers"] def __init__(self, *args, **kwargs): requires_backends(self, ["transformers"]) -class LatentDiffusion(metaclass=DummyObject): +class LatentDiffusionPipeline(metaclass=DummyObject): _backends = ["transformers"] def __init__(self, *args, **kwargs): diff --git a/src/diffusers/utils/logging.py b/src/diffusers/utils/logging.py index 63027f3267..1f2d0227b8 100644 --- a/src/diffusers/utils/logging.py +++ b/src/diffusers/utils/logging.py @@ -233,8 +233,8 @@ def disable_propagation() -> None: def enable_propagation() -> None: """ - Enable propagation of the library log outputs. Please disable the HuggingFace Diffusers' default handler to - prevent double logging if the root logger has been configured. + Enable propagation of the library log outputs. Please disable the HuggingFace Diffusers' default handler to prevent + double logging if the root logger has been configured. """ _configure_library_root_logger() diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index aa40513621..453b4fa285 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -22,7 +22,6 @@ import numpy as np import torch from diffusers import ( - GradTTSPipeline, BDDMPipeline, DDIMPipeline, DDIMScheduler, @@ -31,6 +30,7 @@ from diffusers import ( GlidePipeline, GlideSuperResUNetModel, GlideTextToImageUNetModel, + GradTTSPipeline, GradTTSScheduler, LatentDiffusionPipeline, NCSNpp, diff --git a/utils/check_copies.py b/utils/check_copies.py index 7565bfa51b..50f02cac65 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -24,7 +24,7 @@ from doc_builder.style_doc import style_docstrings_in_code # All paths are set with the intent you should run this script from the root of the repo with the command # python utils/check_copies.py -TRANSFORMERS_PATH = "src/transformers" +TRANSFORMERS_PATH = "src/diffusers" PATH_TO_DOCS = "docs/source/en" REPO_PATH = "." 
@@ -76,7 +76,7 @@ def _should_continue(line, indent): return line.startswith(indent) or len(line) <= 1 or re.search(r"^\s*\)(\s*->.*:|:)\s*$", line) is not None -def find_code_in_transformers(object_name): +def find_code_in_diffusers(object_name): """Find and return the code source code of `object_name`.""" parts = object_name.split(".") i = 0 @@ -88,9 +88,7 @@ def find_code_in_transformers(object_name): if i < len(parts): module = os.path.join(module, parts[i]) if i >= len(parts): - raise ValueError( - f"`object_name` should begin with the name of a module of transformers but got {object_name}." - ) + raise ValueError(f"`object_name` should begin with the name of a module of diffusers but got {object_name}.") with open(os.path.join(TRANSFORMERS_PATH, f"{module}.py"), "r", encoding="utf-8", newline="\n") as f: lines = f.readlines() @@ -121,7 +119,7 @@ def find_code_in_transformers(object_name): return "".join(code_lines) -_re_copy_warning = re.compile(r"^(\s*)#\s*Copied from\s+transformers\.(\S+\.\S+)\s*($|\S.*$)") +_re_copy_warning = re.compile(r"^(\s*)#\s*Copied from\s+diffusers\.(\S+\.\S+)\s*($|\S.*$)") _re_replace_pattern = re.compile(r"^\s*(\S+)->(\S+)(\s+.*|$)") @@ -167,7 +165,7 @@ def is_copy_consistent(filename, overwrite=False): # There is some copied code here, let's retrieve the original. indent, object_name, replace_pattern = search.groups() - theoretical_code = find_code_in_transformers(object_name) + theoretical_code = find_code_in_diffusers(object_name) theoretical_indent = get_indent(theoretical_code) start_index = line_index + 1 if indent == theoretical_indent else line_index + 2 @@ -235,7 +233,9 @@ def check_copies(overwrite: bool = False): + diff + "\nRun `make fix-copies` or `python utils/check_copies.py --fix_and_overwrite` to fix them." ) - check_model_list_copy(overwrite=overwrite) + + +# check_model_list_copy(overwrite=overwrite) def check_full_copies(overwrite: bool = False): @@ -348,8 +348,8 @@ def convert_to_localized_md(model_list, localized_model_list, format_str): def convert_readme_to_index(model_list): - model_list = model_list.replace("https://huggingface.co/docs/transformers/main/", "") - return model_list.replace("https://huggingface.co/docs/transformers/", "") + model_list = model_list.replace("https://huggingface.co/docs/diffusers/main/", "") + return model_list.replace("https://huggingface.co/docs/diffusers/", "") def _find_text_in_file(filename, start_prompt, end_prompt): @@ -383,9 +383,9 @@ def check_model_list_copy(overwrite=False, max_per_line=119): # Fix potential doc links in the README with open(os.path.join(REPO_PATH, "README.md"), "r", encoding="utf-8", newline="\n") as f: readme = f.read() - new_readme = readme.replace("https://huggingface.co/transformers", "https://huggingface.co/docs/transformers") + new_readme = readme.replace("https://huggingface.co/diffusers", "https://huggingface.co/docs/diffusers") new_readme = new_readme.replace( - "https://huggingface.co/docs/main/transformers", "https://huggingface.co/docs/transformers/main" + "https://huggingface.co/docs/main/diffusers", "https://huggingface.co/docs/diffusers/main" ) if new_readme != readme: if overwrite: diff --git a/utils/custom_init_isort.py b/utils/custom_init_isort.py new file mode 100644 index 0000000000..6501654872 --- /dev/null +++ b/utils/custom_init_isort.py @@ -0,0 +1,250 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import re + + +PATH_TO_TRANSFORMERS = "src/diffusers" + +# Pattern that looks at the indentation in a line. +_re_indent = re.compile(r"^(\s*)\S") +# Pattern that matches `"key":" and puts `key` in group 0. +_re_direct_key = re.compile(r'^\s*"([^"]+)":') +# Pattern that matches `_import_structure["key"]` and puts `key` in group 0. +_re_indirect_key = re.compile(r'^\s*_import_structure\["([^"]+)"\]') +# Pattern that matches `"key",` and puts `key` in group 0. +_re_strip_line = re.compile(r'^\s*"([^"]+)",\s*$') +# Pattern that matches any `[stuff]` and puts `stuff` in group 0. +_re_bracket_content = re.compile(r"\[([^\]]+)\]") + + +def get_indent(line): + """Returns the indent in `line`.""" + search = _re_indent.search(line) + return "" if search is None else search.groups()[0] + + +def split_code_in_indented_blocks(code, indent_level="", start_prompt=None, end_prompt=None): + """ + Split `code` into its indented blocks, starting at `indent_level`. If provided, begins splitting after + `start_prompt` and stops at `end_prompt` (but returns what's before `start_prompt` as a first block and what's + after `end_prompt` as a last block, so `code` is always the same as joining the result of this function). + """ + # Let's split the code into lines and move to start_index. + index = 0 + lines = code.split("\n") + if start_prompt is not None: + while not lines[index].startswith(start_prompt): + index += 1 + blocks = ["\n".join(lines[:index])] + else: + blocks = [] + + # We split into blocks until we get to the `end_prompt` (or the end of the block). + current_block = [lines[index]] + index += 1 + while index < len(lines) and (end_prompt is None or not lines[index].startswith(end_prompt)): + if len(lines[index]) > 0 and get_indent(lines[index]) == indent_level: + if len(current_block) > 0 and get_indent(current_block[-1]).startswith(indent_level + " "): + current_block.append(lines[index]) + blocks.append("\n".join(current_block)) + if index < len(lines) - 1: + current_block = [lines[index + 1]] + index += 1 + else: + current_block = [] + else: + blocks.append("\n".join(current_block)) + current_block = [lines[index]] + else: + current_block.append(lines[index]) + index += 1 + + # Adds current block if it's nonempty. + if len(current_block) > 0: + blocks.append("\n".join(current_block)) + + # Add final block after end_prompt if provided. + if end_prompt is not None and index < len(lines): + blocks.append("\n".join(lines[index:])) + + return blocks + + +def ignore_underscore(key): + "Wraps a `key` (that maps an object to string) to lower case and remove underscores." + + def _inner(x): + return key(x).lower().replace("_", "") + + return _inner + + +def sort_objects(objects, key=None): + "Sort a list of `objects` following the rules of isort. `key` optionally maps an object to a str." + # If no key is provided, we use a noop. + def noop(x): + return x + + if key is None: + key = noop + # Constants are all uppercase, they go first. + constants = [obj for obj in objects if key(obj).isupper()] + # Classes are not all uppercase but start with a capital, they go second. 
+    classes = [obj for obj in objects if key(obj)[0].isupper() and not key(obj).isupper()]
+    # Functions begin with a lowercase, they go last.
+    functions = [obj for obj in objects if not key(obj)[0].isupper()]
+
+    key1 = ignore_underscore(key)
+    return sorted(constants, key=key1) + sorted(classes, key=key1) + sorted(functions, key=key1)
+
+
+def sort_objects_in_import(import_statement):
+    """
+    Return the same `import_statement` but with objects properly sorted.
+    """
+    # This inner function sorts imports between [ ].
+    def _replace(match):
+        imports = match.groups()[0]
+        if "," not in imports:
+            return f"[{imports}]"
+        keys = [part.strip().replace('"', "") for part in imports.split(",")]
+        # We will have a final empty element if the line finished with a comma.
+        if len(keys[-1]) == 0:
+            keys = keys[:-1]
+        return "[" + ", ".join([f'"{k}"' for k in sort_objects(keys)]) + "]"
+
+    lines = import_statement.split("\n")
+    if len(lines) > 3:
+        # Here we have to sort internal imports that are on several lines (one per name):
+        # key: [
+        #     "object1",
+        #     "object2",
+        #     ...
+        # ]
+
+        # We may have to ignore one or two lines on each side.
+        idx = 2 if lines[1].strip() == "[" else 1
+        keys_to_sort = [(i, _re_strip_line.search(line).groups()[0]) for i, line in enumerate(lines[idx:-idx])]
+        sorted_indices = sort_objects(keys_to_sort, key=lambda x: x[1])
+        sorted_lines = [lines[x[0] + idx] for x in sorted_indices]
+        return "\n".join(lines[:idx] + sorted_lines + lines[-idx:])
+    elif len(lines) == 3:
+        # Here we have to sort internal imports that are on one separate line:
+        # key: [
+        #     "object1", "object2", ...
+        # ]
+        if _re_bracket_content.search(lines[1]) is not None:
+            lines[1] = _re_bracket_content.sub(_replace, lines[1])
+        else:
+            keys = [part.strip().replace('"', "") for part in lines[1].split(",")]
+            # We will have a final empty element if the line finished with a comma.
+            if len(keys[-1]) == 0:
+                keys = keys[:-1]
+            lines[1] = get_indent(lines[1]) + ", ".join([f'"{k}"' for k in sort_objects(keys)])
+        return "\n".join(lines)
+    else:
+        # Finally we have to deal with imports fitting on one line.
+        import_statement = _re_bracket_content.sub(_replace, import_statement)
+        return import_statement
+
+
+def sort_imports(file, check_only=True):
+    """
+    Sort `_import_structure` imports in `file`, `check_only` determines if we only check or overwrite.
+    """
+    with open(file, "r") as f:
+        code = f.read()
+
+    if "_import_structure" not in code:
+        return
+
+    # Blocks of indent level 0
+    main_blocks = split_code_in_indented_blocks(
+        code, start_prompt="_import_structure = {", end_prompt="if TYPE_CHECKING:"
+    )
+
+    # We ignore block 0 (everything until start_prompt) and the last block (everything after end_prompt).
+    for block_idx in range(1, len(main_blocks) - 1):
+        # Check if the block contains some `_import_structure`s thingy to sort.
+        block = main_blocks[block_idx]
+        block_lines = block.split("\n")
+
+        # Get to the start of the imports.
+        line_idx = 0
+        while line_idx < len(block_lines) and "_import_structure" not in block_lines[line_idx]:
+            # Skip dummy import blocks
+            if "import dummy" in block_lines[line_idx]:
+                line_idx = len(block_lines)
+            else:
+                line_idx += 1
+        if line_idx >= len(block_lines):
+            continue
+
+        # Ignore beginning and last line: they don't contain anything.
+        internal_block_code = "\n".join(block_lines[line_idx:-1])
+        indent = get_indent(block_lines[1])
+        # Split the internal block into blocks of indent level 1.
+        internal_blocks = split_code_in_indented_blocks(internal_block_code, indent_level=indent)
+        # We have two categories of import key: list or _import_structure[key].append/extend
+        pattern = _re_direct_key if "_import_structure" in block_lines[0] else _re_indirect_key
+        # Grab the keys, but there is a trap: some lines are empty or just comments.
+        keys = [(pattern.search(b).groups()[0] if pattern.search(b) is not None else None) for b in internal_blocks]
+        # We only sort the lines with a key.
+        keys_to_sort = [(i, key) for i, key in enumerate(keys) if key is not None]
+        sorted_indices = [x[0] for x in sorted(keys_to_sort, key=lambda x: x[1])]
+
+        # We reorder the blocks by leaving empty lines/comments as they were and reorder the rest.
+        count = 0
+        reorderded_blocks = []
+        for i in range(len(internal_blocks)):
+            if keys[i] is None:
+                reorderded_blocks.append(internal_blocks[i])
+            else:
+                block = sort_objects_in_import(internal_blocks[sorted_indices[count]])
+                reorderded_blocks.append(block)
+                count += 1
+
+        # And we put our main block back together with its first and last line.
+        main_blocks[block_idx] = "\n".join(block_lines[:line_idx] + reorderded_blocks + [block_lines[-1]])
+
+    if code != "\n".join(main_blocks):
+        if check_only:
+            return True
+        else:
+            print(f"Overwriting {file}.")
+            with open(file, "w") as f:
+                f.write("\n".join(main_blocks))
+
+
+def sort_imports_in_all_inits(check_only=True):
+    failures = []
+    for root, _, files in os.walk(PATH_TO_TRANSFORMERS):
+        if "__init__.py" in files:
+            result = sort_imports(os.path.join(root, "__init__.py"), check_only=check_only)
+            if result:
+                failures = [os.path.join(root, "__init__.py")]
+    if len(failures) > 0:
+        raise ValueError(f"Would overwrite {len(failures)} files, run `make style`.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--check_only", action="store_true", help="Whether to only check or fix style.")
+    args = parser.parse_args()
+
+    sort_imports_in_all_inits(check_only=args.check_only)
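
A note on the retargeted copy checks: after this patch, `utils/check_copies.py` looks for `# Copied from diffusers.<module>.<object> ...` markers instead of `transformers` ones (see `_re_copy_warning` and `_re_replace_pattern` above). The sketch below only exercises those two regexes; the marker line, the module path and the `Upsample->MyUpsample` mapping are made-up examples for illustration, not code from this patch.

```python
import re

# Regexes as retargeted in utils/check_copies.py above (transformers -> diffusers).
_re_copy_warning = re.compile(r"^(\s*)#\s*Copied from\s+diffusers\.(\S+\.\S+)\s*($|\S.*$)")
_re_replace_pattern = re.compile(r"^\s*(\S+)->(\S+)(\s+.*|$)")

# Hypothetical marker line as it might sit above a duplicated class.
line = "    # Copied from diffusers.models.resnet.Upsample with Upsample->MyUpsample"

indent, object_name, replace_pattern = _re_copy_warning.search(line).groups()
print(object_name)  # models.resnet.Upsample (path relative to src/diffusers, without the package prefix)

# check_copies.py strips the leading "with" before applying _re_replace_pattern.
for part in replace_pattern.replace("with", "").split(","):
    old, new, _ = _re_replace_pattern.search(part).groups()
    print(old, "->", new)  # Upsample -> MyUpsample
```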
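
The reflowed `get_timestep_embedding` docstrings (models/embeddings.py and pipeline_latent_diffusion.py) describe the standard DDPM sinusoidal embedding. Below is a minimal sketch of that construction; it ignores the `flip_sin_to_cos`, `downscale_freq_shift` and `scale` options visible in the library signature, so it is an approximation rather than the exact library behaviour.

```python
import math

import torch


def sinusoidal_timestep_embedding(timesteps: torch.Tensor, embedding_dim: int, max_period: int = 10000):
    """Sketch of the [N] -> [N, embedding_dim] sinusoidal embedding described in the docstrings above."""
    half_dim = embedding_dim // 2
    # Frequencies spaced geometrically from 1 down to roughly 1/max_period.
    freqs = torch.exp(-math.log(max_period) * torch.arange(half_dim, dtype=torch.float32) / half_dim)
    args = timesteps.float()[:, None] * freqs[None, :]
    return torch.cat([torch.sin(args), torch.cos(args)], dim=-1)


emb = sinusoidal_timestep_embedding(torch.arange(4), embedding_dim=8)
print(emb.shape)  # torch.Size([4, 8])
```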
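
The three identical `betas_for_alpha_bar` docstrings above all describe the same construction: discretize a continuous `alpha_bar(t)`, the cumulative product of `1 - beta` over `t` in `[0, 1]`, into per-step betas. A sketch of that relationship follows; note that the scheduler versions above take only `(num_diffusion_timesteps, max_beta)` and define `alpha_bar` internally, whereas here it is passed in explicitly, and the cosine schedule used as the example is an assumption rather than something read off this patch.

```python
import math


def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    # beta_i = 1 - alpha_bar(t_{i+1}) / alpha_bar(t_i), clipped to max_beta to prevent singularities.
    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return betas


def cosine_alpha_bar(t):
    # Cosine schedule commonly paired with this helper (an assumption, not part of this patch).
    return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2


betas = betas_for_alpha_bar(1000, cosine_alpha_bar)
print(len(betas), betas[0], betas[-1])  # 1000 betas; tiny first beta, last beta clipped near max_beta
```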
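
The `EMAModel` docstring quotes @crowsonkb's warmup numbers (decay 0.999 around 31.6K steps and 0.9999 around 1M steps for `power=2/3`; at 10K and 215.4K steps for `power=3/4`). Those numbers are consistent with the usual EMA warmup formula `decay = 1 - (1 + step / inv_gamma) ** (-power)`; the formula itself is not spelled out in this patch, so treat it as an assumption. A quick check:

```python
def ema_decay(step, inv_gamma=1.0, power=2 / 3):
    # Warmup schedule consistent with the numbers quoted in the EMAModel docstring.
    return 1 - (1 + step / inv_gamma) ** (-power)


print(round(ema_decay(31_600), 4))               # ~0.999  (power=2/3)
print(round(ema_decay(1_000_000), 5))            # ~0.9999 (power=2/3)
print(round(ema_decay(10_000, power=3 / 4), 4))  # ~0.999  (power=3/4)
print(round(ema_decay(215_400, power=3 / 4), 5)) # ~0.9999 (power=3/4)
```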