diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py
index 9d919d374a..e8fec35646 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -245,11 +245,13 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         Args:
             x (`torch.Tensor`): Input batch of images.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+                Whether to return a [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] instead of a plain
+                tuple.
 
         Returns:
                 The latent representations of the encoded images. If `return_dict` is True, a
-                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+                [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is
+                returned.
         """
         if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
             return self.tiled_encode(x, return_dict=return_dict)
@@ -331,12 +333,13 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
         Args:
             x (`torch.Tensor`): Input batch of images.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+                Whether or not to return a [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] instead of a
+                plain tuple.
 
         Returns:
-            [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
-                If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
-                `tuple` is returned.
+            [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
+                If `return_dict` is True, a [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] is returned,
+                otherwise a plain `tuple` is returned.
         """
         overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
         blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
index 67540cb7dc..b73202aedb 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py
@@ -323,11 +323,13 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin):
         Args:
             x (`torch.Tensor`): Input batch of images.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+                Whether to return a [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] instead of a plain
+                tuple.
 
         Returns:
                 The latent representations of the encoded images. If `return_dict` is True, a
-                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+                [`~models.autoencoders.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is
+                returned.
         """
         h = self.encoder(x)
         moments = self.quant_conv(h)
diff --git a/src/diffusers/models/autoencoders/consistency_decoder_vae.py b/src/diffusers/models/autoencoders/consistency_decoder_vae.py
index 212c465377..3409549c65 100644
--- a/src/diffusers/models/autoencoders/consistency_decoder_vae.py
+++ b/src/diffusers/models/autoencoders/consistency_decoder_vae.py
@@ -284,13 +284,13 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
         Args:
             x (`torch.Tensor`): Input batch of images.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a plain
-                tuple.
+                Whether to return a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
+                instead of a plain tuple.
 
         Returns:
                 The latent representations of the encoded images. If `return_dict` is True, a
-                [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned, otherwise a plain `tuple`
-                is returned.
+                [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned, otherwise a
+                plain `tuple` is returned.
         """
         if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
             return self.tiled_encode(x, return_dict=return_dict)
@@ -382,13 +382,13 @@ class ConsistencyDecoderVAE(ModelMixin, ConfigMixin):
         Args:
             x (`torch.Tensor`): Input batch of images.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a
-                plain tuple.
+                Whether or not to return a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
+                instead of a plain tuple.
 
         Returns:
-            [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] or `tuple`:
-                If return_dict is True, a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned,
-                otherwise a plain `tuple` is returned.
+            [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] or `tuple`:
+                If return_dict is True, a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
+                is returned, otherwise a plain `tuple` is returned.
         """
         overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
         blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
diff --git a/src/diffusers/models/dual_transformer_2d.py b/src/diffusers/models/dual_transformer_2d.py
deleted file mode 100644
index b8e40f14d5..0000000000
--- a/src/diffusers/models/dual_transformer_2d.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from ..utils import deprecate
-from .transformers.dual_transformer_2d import DualTransformer2DModel
-
-
-class DualTransformer2DModel(DualTransformer2DModel):
-    deprecation_message = "Importing `DualTransformer2DModel` from `diffusers.models.dual_transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.dual_transformer_2d import DualTransformer2DModel`, instead."
-    deprecate("DualTransformer2DModel", "0.29", deprecation_message)
diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py
deleted file mode 100644
index 328835a953..0000000000
--- a/src/diffusers/models/prior_transformer.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from ..utils import deprecate
-from .transformers.prior_transformer import PriorTransformer, PriorTransformerOutput
-
-
-class PriorTransformerOutput(PriorTransformerOutput):
-    deprecation_message = "Importing `PriorTransformerOutput` from `diffusers.models.prior_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.prior_transformer import PriorTransformerOutput`, instead."
-    deprecate("PriorTransformerOutput", "0.29", deprecation_message)
-
-
-class PriorTransformer(PriorTransformer):
-    deprecation_message = "Importing `PriorTransformer` from `diffusers.models.prior_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.prior_transformer import PriorTransformer`, instead."
-    deprecate("PriorTransformer", "0.29", deprecation_message)
diff --git a/src/diffusers/models/t5_film_transformer.py b/src/diffusers/models/t5_film_transformer.py
deleted file mode 100644
index 6aa5ff7449..0000000000
--- a/src/diffusers/models/t5_film_transformer.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from ..utils import deprecate
-from .transformers.t5_film_transformer import (
-    DecoderLayer,
-    NewGELUActivation,
-    T5DenseGatedActDense,
-    T5FilmDecoder,
-    T5FiLMLayer,
-    T5LayerCrossAttention,
-    T5LayerFFCond,
-    T5LayerNorm,
-    T5LayerSelfAttentionCond,
-)
-
-
-class T5FilmDecoder(T5FilmDecoder):
-    deprecation_message = "Importing `T5FilmDecoder` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5FilmDecoder`, instead."
-    deprecate("T5FilmDecoder", "0.29", deprecation_message)
-
-
-class DecoderLayer(DecoderLayer):
-    deprecation_message = "Importing `DecoderLayer` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import DecoderLayer`, instead."
-    deprecate("DecoderLayer", "0.29", deprecation_message)
-
-
-class T5LayerSelfAttentionCond(T5LayerSelfAttentionCond):
-    deprecation_message = "Importing `T5LayerSelfAttentionCond` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5LayerSelfAttentionCond`, instead."
-    deprecate("T5LayerSelfAttentionCond", "0.29", deprecation_message)
-
-
-class T5LayerCrossAttention(T5LayerCrossAttention):
-    deprecation_message = "Importing `T5LayerCrossAttention` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5LayerCrossAttention`, instead."
-    deprecate("T5LayerCrossAttention", "0.29", deprecation_message)
-
-
-class T5LayerFFCond(T5LayerFFCond):
-    deprecation_message = "Importing `T5LayerFFCond` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5LayerFFCond`, instead."
-    deprecate("T5LayerFFCond", "0.29", deprecation_message)
-
-
-class T5DenseGatedActDense(T5DenseGatedActDense):
-    deprecation_message = "Importing `T5DenseGatedActDense` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5DenseGatedActDense`, instead."
-    deprecate("T5DenseGatedActDense", "0.29", deprecation_message)
-
-
-class T5LayerNorm(T5LayerNorm):
-    deprecation_message = "Importing `T5LayerNorm` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5LayerNorm`, instead."
-    deprecate("T5LayerNorm", "0.29", deprecation_message)
-
-
-class NewGELUActivation(NewGELUActivation):
-    deprecation_message = "Importing `T5LayerNorm` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import NewGELUActivation`, instead."
-    deprecate("NewGELUActivation", "0.29", deprecation_message)
-
-
-class T5FiLMLayer(T5FiLMLayer):
-    deprecation_message = "Importing `T5FiLMLayer` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5FiLMLayer`, instead."
-    deprecate("T5FiLMLayer", "0.29", deprecation_message)
diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py
deleted file mode 100644
index 5d8ef1347a..0000000000
--- a/src/diffusers/models/transformer_2d.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from ..utils import deprecate
-from .transformers.transformer_2d import Transformer2DModel, Transformer2DModelOutput
-
-
-class Transformer2DModelOutput(Transformer2DModelOutput):
-    deprecation_message = "Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead."
-    deprecate("Transformer2DModelOutput", "0.29", deprecation_message)
-
-
-class Transformer2DModel(Transformer2DModel):
-    deprecation_message = "Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead."
-    deprecate("Transformer2DModel", "0.29", deprecation_message)
diff --git a/src/diffusers/models/transformer_temporal.py b/src/diffusers/models/transformer_temporal.py
deleted file mode 100644
index 02e5045802..0000000000
--- a/src/diffusers/models/transformer_temporal.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from ..utils import deprecate
-from .transformers.transformer_temporal import (
-    TransformerSpatioTemporalModel,
-    TransformerTemporalModel,
-    TransformerTemporalModelOutput,
-)
-
-
-class TransformerTemporalModelOutput(TransformerTemporalModelOutput):
-    deprecation_message = "Importing `TransformerTemporalModelOutput` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_temporal import TransformerTemporalModelOutput`, instead."
-    deprecate("TransformerTemporalModelOutput", "0.29", deprecation_message)
-
-
-class TransformerTemporalModel(TransformerTemporalModel):
-    deprecation_message = "Importing `TransformerTemporalModel` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_temporal import TransformerTemporalModel`, instead."
-    deprecate("TransformerTemporalModel", "0.29", deprecation_message)
-
-
-class TransformerSpatioTemporalModel(TransformerSpatioTemporalModel):
-    deprecation_message = "Importing `TransformerSpatioTemporalModel` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_temporal import TransformerSpatioTemporalModel`, instead."
-    deprecate("TransformerTemporalModelOutput", "0.29", deprecation_message)
diff --git a/src/diffusers/models/transformers/dual_transformer_2d.py b/src/diffusers/models/transformers/dual_transformer_2d.py
index edc8cbf783..bbf2d387e4 100644
--- a/src/diffusers/models/transformers/dual_transformer_2d.py
+++ b/src/diffusers/models/transformers/dual_transformer_2d.py
@@ -123,9 +123,9 @@ class DualTransformer2DModel(nn.Module):
                 tuple.
 
         Returns:
-            [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`:
-            [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When
-            returning a tuple, the first element is the sample tensor.
+            [`~models.transformers.transformer_2d.Transformer2DModelOutput`] or `tuple`:
+            [`~models.transformers.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is the sample tensor.
         """
         input_states = hidden_states
 
diff --git a/src/diffusers/models/transformers/prior_transformer.py b/src/diffusers/models/transformers/prior_transformer.py
index 8dbcfc64e0..edac0d9e93 100644
--- a/src/diffusers/models/transformers/prior_transformer.py
+++ b/src/diffusers/models/transformers/prior_transformer.py
@@ -266,13 +266,13 @@ class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, Pef
             attention_mask (`torch.BoolTensor` of shape `(batch_size, num_embeddings)`):
                 Text mask for the text embeddings.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.prior_transformer.PriorTransformerOutput`] instead of a plain
-                tuple.
+                Whether or not to return a [`~models.transformers.prior_transformer.PriorTransformerOutput`] instead of
+                a plain tuple.
 
         Returns:
-            [`~models.prior_transformer.PriorTransformerOutput`] or `tuple`:
-                If return_dict is True, a [`~models.prior_transformer.PriorTransformerOutput`] is returned, otherwise a
-                tuple is returned where the first element is the sample tensor.
+            [`~models.transformers.prior_transformer.PriorTransformerOutput`] or `tuple`:
+                If `return_dict` is True, a [`~models.transformers.prior_transformer.PriorTransformerOutput`] is
+                returned, otherwise a tuple is returned where the first element is the sample tensor.
         """
         batch_size = hidden_states.shape[0]
 
diff --git a/src/diffusers/models/transformers/transformer_2d.py b/src/diffusers/models/transformers/transformer_2d.py
index ef9e0de0b6..5f21b2f0e7 100644
--- a/src/diffusers/models/transformers/transformer_2d.py
+++ b/src/diffusers/models/transformers/transformer_2d.py
@@ -377,8 +377,8 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
                 tuple.
 
         Returns:
-            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
-            `tuple` where the first element is the sample tensor.
+            If `return_dict` is True, a [`~models.transformers.transformer_2d.Transformer2DModelOutput`] is returned,
+            otherwise a `tuple` where the first element is the sample tensor.
         """
         if cross_attention_kwargs is not None:
             if cross_attention_kwargs.get("scale", None) is not None:
diff --git a/src/diffusers/models/transformers/transformer_temporal.py b/src/diffusers/models/transformers/transformer_temporal.py
index 2e1bb041a2..c0c5467050 100644
--- a/src/diffusers/models/transformers/transformer_temporal.py
+++ b/src/diffusers/models/transformers/transformer_temporal.py
@@ -149,13 +149,14 @@ class TransformerTemporalModel(ModelMixin, ConfigMixin):
                 `self.processor` in
                 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
-                tuple.
+                Whether or not to return a [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`]
+                instead of a plain tuple.
 
         Returns:
-            [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
-                If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is
-                returned, otherwise a `tuple` where the first element is the sample tensor.
+            [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
+                If `return_dict` is True, a
+                [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`] is returned, otherwise a
+                `tuple` where the first element is the sample tensor.
         """
         # 1. Input
         batch_frames, channel, height, width = hidden_states.shape
@@ -294,13 +295,14 @@ class TransformerSpatioTemporalModel(nn.Module):
                 A tensor indicating whether the input contains only images. 1 indicates that the input contains only
                 images, 0 indicates that the input contains video frames.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of a
-                plain tuple.
+                Whether or not to return a [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`]
+                instead of a plain tuple.
 
         Returns:
-            [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
-                If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is
-                returned, otherwise a `tuple` where the first element is the sample tensor.
+            [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
+                If `return_dict` is True, a
+                [`~models.transformers.transformer_temporal.TransformerTemporalModelOutput`] is returned, otherwise a
+                `tuple` where the first element is the sample tensor.
         """
         # 1. Input
         batch_frames, _, height, width = hidden_states.shape
diff --git a/src/diffusers/models/unet_1d.py b/src/diffusers/models/unet_1d.py
deleted file mode 100644
index e857c90cae..0000000000
--- a/src/diffusers/models/unet_1d.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ..utils import deprecate
-from .unets.unet_1d import UNet1DModel, UNet1DOutput
-
-
-class UNet1DOutput(UNet1DOutput):
-    deprecation_message = "Importing `UNet1DOutput` from `diffusers.models.unet_1d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d import UNet1DOutput`, instead."
-    deprecate("UNet1DOutput", "0.29", deprecation_message)
-
-
-class UNet1DModel(UNet1DModel):
-    deprecation_message = "Importing `UNet1DModel` from `diffusers.models.unet_1d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d import UNet1DModel`, instead."
-    deprecate("UNet1DModel", "0.29", deprecation_message)
diff --git a/src/diffusers/models/unet_1d_blocks.py b/src/diffusers/models/unet_1d_blocks.py
deleted file mode 100644
index 6b0f09457d..0000000000
--- a/src/diffusers/models/unet_1d_blocks.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ..utils import deprecate
-from .unets.unet_1d_blocks import (
-    AttnDownBlock1D,
-    AttnUpBlock1D,
-    DownBlock1D,
-    DownBlock1DNoSkip,
-    DownResnetBlock1D,
-    Downsample1d,
-    MidResTemporalBlock1D,
-    OutConv1DBlock,
-    OutValueFunctionBlock,
-    ResConvBlock,
-    SelfAttention1d,
-    UNetMidBlock1D,
-    UpBlock1D,
-    UpBlock1DNoSkip,
-    UpResnetBlock1D,
-    Upsample1d,
-    ValueFunctionMidBlock1D,
-)
-
-
-class DownResnetBlock1D(DownResnetBlock1D):
-    deprecation_message = "Importing `DownResnetBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import DownResnetBlock1D`, instead."
-    deprecate("DownResnetBlock1D", "0.29", deprecation_message)
-
-
-class UpResnetBlock1D(UpResnetBlock1D):
-    deprecation_message = "Importing `UpResnetBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import UpResnetBlock1D`, instead."
-    deprecate("UpResnetBlock1D", "0.29", deprecation_message)
-
-
-class ValueFunctionMidBlock1D(ValueFunctionMidBlock1D):
-    deprecation_message = "Importing `ValueFunctionMidBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import ValueFunctionMidBlock1D`, instead."
-    deprecate("ValueFunctionMidBlock1D", "0.29", deprecation_message)
-
-
-class OutConv1DBlock(OutConv1DBlock):
-    deprecation_message = "Importing `OutConv1DBlock` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import OutConv1DBlock`, instead."
-    deprecate("OutConv1DBlock", "0.29", deprecation_message)
-
-
-class OutValueFunctionBlock(OutValueFunctionBlock):
-    deprecation_message = "Importing `OutValueFunctionBlock` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import OutValueFunctionBlock`, instead."
-    deprecate("OutValueFunctionBlock", "0.29", deprecation_message)
-
-
-class Downsample1d(Downsample1d):
-    deprecation_message = "Importing `Downsample1d` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import Downsample1d`, instead."
-    deprecate("Downsample1d", "0.29", deprecation_message)
-
-
-class Upsample1d(Upsample1d):
-    deprecation_message = "Importing `Upsample1d` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import Upsample1d`, instead."
-    deprecate("Upsample1d", "0.29", deprecation_message)
-
-
-class SelfAttention1d(SelfAttention1d):
-    deprecation_message = "Importing `SelfAttention1d` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import SelfAttention1d`, instead."
-    deprecate("SelfAttention1d", "0.29", deprecation_message)
-
-
-class ResConvBlock(ResConvBlock):
-    deprecation_message = "Importing `ResConvBlock` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import ResConvBlock`, instead."
-    deprecate("ResConvBlock", "0.29", deprecation_message)
-
-
-class UNetMidBlock1D(UNetMidBlock1D):
-    deprecation_message = "Importing `UNetMidBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import UNetMidBlock1D`, instead."
-    deprecate("UNetMidBlock1D", "0.29", deprecation_message)
-
-
-class AttnDownBlock1D(AttnDownBlock1D):
-    deprecation_message = "Importing `AttnDownBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import AttnDownBlock1D`, instead."
-    deprecate("AttnDownBlock1D", "0.29", deprecation_message)
-
-
-class DownBlock1D(DownBlock1D):
-    deprecation_message = "Importing `DownBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import DownBlock1D`, instead."
-    deprecate("DownBlock1D", "0.29", deprecation_message)
-
-
-class DownBlock1DNoSkip(DownBlock1DNoSkip):
-    deprecation_message = "Importing `DownBlock1DNoSkip` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import DownBlock1DNoSkip`, instead."
-    deprecate("DownBlock1DNoSkip", "0.29", deprecation_message)
-
-
-class AttnUpBlock1D(AttnUpBlock1D):
-    deprecation_message = "Importing `AttnUpBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import AttnUpBlock1D`, instead."
-    deprecate("AttnUpBlock1D", "0.29", deprecation_message)
-
-
-class UpBlock1D(UpBlock1D):
-    deprecation_message = "Importing `UpBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import UpBlock1D`, instead."
-    deprecate("UpBlock1D", "0.29", deprecation_message)
-
-
-class UpBlock1DNoSkip(UpBlock1DNoSkip):
-    deprecation_message = "Importing `UpBlock1DNoSkip` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import UpBlock1DNoSkip`, instead."
-    deprecate("UpBlock1DNoSkip", "0.29", deprecation_message)
-
-
-class MidResTemporalBlock1D(MidResTemporalBlock1D):
-    deprecation_message = "Importing `MidResTemporalBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import MidResTemporalBlock1D`, instead."
-    deprecate("MidResTemporalBlock1D", "0.29", deprecation_message)
-
-
-def get_down_block(
-    down_block_type: str,
-    num_layers: int,
-    in_channels: int,
-    out_channels: int,
-    temb_channels: int,
-    add_downsample: bool,
-):
-    deprecation_message = "Importing `get_down_block` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import get_down_block`, instead."
-    deprecate("get_down_block", "0.29", deprecation_message)
-
-    from .unets.unet_1d_blocks import get_down_block
-
-    return get_down_block(
-        down_block_type=down_block_type,
-        num_layers=num_layers,
-        in_channels=in_channels,
-        out_channels=out_channels,
-        temb_channels=temb_channels,
-        add_downsample=add_downsample,
-    )
-
-
-def get_up_block(
-    up_block_type: str, num_layers: int, in_channels: int, out_channels: int, temb_channels: int, add_upsample: bool
-):
-    deprecation_message = "Importing `get_up_block` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import get_up_block`, instead."
-    deprecate("get_up_block", "0.29", deprecation_message)
-
-    from .unets.unet_1d_blocks import get_up_block
-
-    return get_up_block(
-        up_block_type=up_block_type,
-        num_layers=num_layers,
-        in_channels=in_channels,
-        out_channels=out_channels,
-        temb_channels=temb_channels,
-        add_upsample=add_upsample,
-    )
-
-
-def get_mid_block(
-    mid_block_type: str,
-    num_layers: int,
-    in_channels: int,
-    mid_channels: int,
-    out_channels: int,
-    embed_dim: int,
-    add_downsample: bool,
-):
-    deprecation_message = "Importing `get_mid_block` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import get_mid_block`, instead."
-    deprecate("get_mid_block", "0.29", deprecation_message)
-
-    from .unets.unet_1d_blocks import get_mid_block
-
-    return get_mid_block(
-        mid_block_type=mid_block_type,
-        num_layers=num_layers,
-        in_channels=in_channels,
-        mid_channels=mid_channels,
-        out_channels=out_channels,
-        embed_dim=embed_dim,
-        add_downsample=add_downsample,
-    )
-
-
-def get_out_block(
-    *, out_block_type: str, num_groups_out: int, embed_dim: int, out_channels: int, act_fn: str, fc_dim: int
-):
-    deprecation_message = "Importing `get_out_block` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import get_out_block`, instead."
-    deprecate("get_out_block", "0.29", deprecation_message)
-
-    from .unets.unet_1d_blocks import get_out_block
-
-    return get_out_block(
-        out_block_type=out_block_type,
-        num_groups_out=num_groups_out,
-        embed_dim=embed_dim,
-        out_channels=out_channels,
-        act_fn=act_fn,
-        fc_dim=fc_dim,
-    )
diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py
deleted file mode 100644
index 21f1fea68d..0000000000
--- a/src/diffusers/models/unet_2d.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from ..utils import deprecate
-from .unets.unet_2d import UNet2DModel, UNet2DOutput
-
-
-class UNet2DOutput(UNet2DOutput):
-    deprecation_message = "Importing `UNet2DOutput` from `diffusers.models.unet_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d import UNet2DOutput`, instead."
-    deprecate("UNet2DOutput", "0.29", deprecation_message)
-
-
-class UNet2DModel(UNet2DModel):
-    deprecation_message = "Importing `UNet2DModel` from `diffusers.models.unet_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d import UNet2DModel`, instead."
-    deprecate("UNet2DModel", "0.29", deprecation_message)
diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py
deleted file mode 100644
index 931fa89a73..0000000000
--- a/src/diffusers/models/unet_2d_blocks.py
+++ /dev/null
@@ -1,375 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional
-
-from ..utils import deprecate
-from .unets.unet_2d_blocks import (
-    AttnDownBlock2D,
-    AttnDownEncoderBlock2D,
-    AttnSkipDownBlock2D,
-    AttnSkipUpBlock2D,
-    AttnUpBlock2D,
-    AttnUpDecoderBlock2D,
-    AutoencoderTinyBlock,
-    CrossAttnDownBlock2D,
-    CrossAttnUpBlock2D,
-    DownBlock2D,
-    KAttentionBlock,
-    KCrossAttnDownBlock2D,
-    KCrossAttnUpBlock2D,
-    KDownBlock2D,
-    KUpBlock2D,
-    ResnetDownsampleBlock2D,
-    ResnetUpsampleBlock2D,
-    SimpleCrossAttnDownBlock2D,
-    SimpleCrossAttnUpBlock2D,
-    SkipDownBlock2D,
-    SkipUpBlock2D,
-    UNetMidBlock2D,
-    UNetMidBlock2DCrossAttn,
-    UNetMidBlock2DSimpleCrossAttn,
-    UpBlock2D,
-    UpDecoderBlock2D,
-)
-
-
-def get_down_block(
-    down_block_type: str,
-    num_layers: int,
-    in_channels: int,
-    out_channels: int,
-    temb_channels: int,
-    add_downsample: bool,
-    resnet_eps: float,
-    resnet_act_fn: str,
-    transformer_layers_per_block: int = 1,
-    num_attention_heads: Optional[int] = None,
-    resnet_groups: Optional[int] = None,
-    cross_attention_dim: Optional[int] = None,
-    downsample_padding: Optional[int] = None,
-    dual_cross_attention: bool = False,
-    use_linear_projection: bool = False,
-    only_cross_attention: bool = False,
-    upcast_attention: bool = False,
-    resnet_time_scale_shift: str = "default",
-    attention_type: str = "default",
-    resnet_skip_time_act: bool = False,
-    resnet_out_scale_factor: float = 1.0,
-    cross_attention_norm: Optional[str] = None,
-    attention_head_dim: Optional[int] = None,
-    downsample_type: Optional[str] = None,
-    dropout: float = 0.0,
-):
-    deprecation_message = "Importing `get_down_block` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import get_down_block`, instead."
-    deprecate("get_down_block", "0.29", deprecation_message)
-
-    from .unets.unet_2d_blocks import get_down_block
-
-    return get_down_block(
-        down_block_type=down_block_type,
-        num_layers=num_layers,
-        in_channels=in_channels,
-        out_channels=out_channels,
-        temb_channels=temb_channels,
-        add_downsample=add_downsample,
-        resnet_eps=resnet_eps,
-        resnet_act_fn=resnet_act_fn,
-        transformer_layers_per_block=transformer_layers_per_block,
-        num_attention_heads=num_attention_heads,
-        resnet_groups=resnet_groups,
-        cross_attention_dim=cross_attention_dim,
-        downsample_padding=downsample_padding,
-        dual_cross_attention=dual_cross_attention,
-        use_linear_projection=use_linear_projection,
-        only_cross_attention=only_cross_attention,
-        upcast_attention=upcast_attention,
-        resnet_time_scale_shift=resnet_time_scale_shift,
-        attention_type=attention_type,
-        resnet_skip_time_act=resnet_skip_time_act,
-        resnet_out_scale_factor=resnet_out_scale_factor,
-        cross_attention_norm=cross_attention_norm,
-        attention_head_dim=attention_head_dim,
-        downsample_type=downsample_type,
-        dropout=dropout,
-    )
-
-
-def get_mid_block(
-    mid_block_type: str,
-    temb_channels: int,
-    in_channels: int,
-    resnet_eps: float,
-    resnet_act_fn: str,
-    resnet_groups: int,
-    output_scale_factor: float = 1.0,
-    transformer_layers_per_block: int = 1,
-    num_attention_heads: Optional[int] = None,
-    cross_attention_dim: Optional[int] = None,
-    dual_cross_attention: bool = False,
-    use_linear_projection: bool = False,
-    mid_block_only_cross_attention: bool = False,
-    upcast_attention: bool = False,
-    resnet_time_scale_shift: str = "default",
-    attention_type: str = "default",
-    resnet_skip_time_act: bool = False,
-    cross_attention_norm: Optional[str] = None,
-    attention_head_dim: Optional[int] = 1,
-    dropout: float = 0.0,
-):
-    if mid_block_type == "UNetMidBlock2DCrossAttn":
-        return UNetMidBlock2DCrossAttn(
-            transformer_layers_per_block=transformer_layers_per_block,
-            in_channels=in_channels,
-            temb_channels=temb_channels,
-            dropout=dropout,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            output_scale_factor=output_scale_factor,
-            resnet_time_scale_shift=resnet_time_scale_shift,
-            cross_attention_dim=cross_attention_dim,
-            num_attention_heads=num_attention_heads,
-            resnet_groups=resnet_groups,
-            dual_cross_attention=dual_cross_attention,
-            use_linear_projection=use_linear_projection,
-            upcast_attention=upcast_attention,
-            attention_type=attention_type,
-        )
-    elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
-        return UNetMidBlock2DSimpleCrossAttn(
-            in_channels=in_channels,
-            temb_channels=temb_channels,
-            dropout=dropout,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            output_scale_factor=output_scale_factor,
-            cross_attention_dim=cross_attention_dim,
-            attention_head_dim=attention_head_dim,
-            resnet_groups=resnet_groups,
-            resnet_time_scale_shift=resnet_time_scale_shift,
-            skip_time_act=resnet_skip_time_act,
-            only_cross_attention=mid_block_only_cross_attention,
-            cross_attention_norm=cross_attention_norm,
-        )
-    elif mid_block_type == "UNetMidBlock2D":
-        return UNetMidBlock2D(
-            in_channels=in_channels,
-            temb_channels=temb_channels,
-            dropout=dropout,
-            num_layers=0,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            output_scale_factor=output_scale_factor,
-            resnet_groups=resnet_groups,
-            resnet_time_scale_shift=resnet_time_scale_shift,
-            add_attention=False,
-        )
-    elif mid_block_type is None:
-        return None
-    else:
-        raise ValueError(f"unknown mid_block_type : {mid_block_type}")
-
-
-def get_up_block(
-    up_block_type: str,
-    num_layers: int,
-    in_channels: int,
-    out_channels: int,
-    prev_output_channel: int,
-    temb_channels: int,
-    add_upsample: bool,
-    resnet_eps: float,
-    resnet_act_fn: str,
-    resolution_idx: Optional[int] = None,
-    transformer_layers_per_block: int = 1,
-    num_attention_heads: Optional[int] = None,
-    resnet_groups: Optional[int] = None,
-    cross_attention_dim: Optional[int] = None,
-    dual_cross_attention: bool = False,
-    use_linear_projection: bool = False,
-    only_cross_attention: bool = False,
-    upcast_attention: bool = False,
-    resnet_time_scale_shift: str = "default",
-    attention_type: str = "default",
-    resnet_skip_time_act: bool = False,
-    resnet_out_scale_factor: float = 1.0,
-    cross_attention_norm: Optional[str] = None,
-    attention_head_dim: Optional[int] = None,
-    upsample_type: Optional[str] = None,
-    dropout: float = 0.0,
-):
-    deprecation_message = "Importing `get_up_block` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import get_up_block`, instead."
-    deprecate("get_up_block", "0.29", deprecation_message)
-
-    from .unets.unet_2d_blocks import get_up_block
-
-    return get_up_block(
-        up_block_type=up_block_type,
-        num_layers=num_layers,
-        in_channels=in_channels,
-        out_channels=out_channels,
-        prev_output_channel=prev_output_channel,
-        temb_channels=temb_channels,
-        add_upsample=add_upsample,
-        resnet_eps=resnet_eps,
-        resnet_act_fn=resnet_act_fn,
-        resolution_idx=resolution_idx,
-        transformer_layers_per_block=transformer_layers_per_block,
-        num_attention_heads=num_attention_heads,
-        resnet_groups=resnet_groups,
-        cross_attention_dim=cross_attention_dim,
-        dual_cross_attention=dual_cross_attention,
-        use_linear_projection=use_linear_projection,
-        only_cross_attention=only_cross_attention,
-        upcast_attention=upcast_attention,
-        resnet_time_scale_shift=resnet_time_scale_shift,
-        attention_type=attention_type,
-        resnet_skip_time_act=resnet_skip_time_act,
-        resnet_out_scale_factor=resnet_out_scale_factor,
-        cross_attention_norm=cross_attention_norm,
-        attention_head_dim=attention_head_dim,
-        upsample_type=upsample_type,
-        dropout=dropout,
-    )
-
-
-class AutoencoderTinyBlock(AutoencoderTinyBlock):
-    deprecation_message = "Importing `AutoencoderTinyBlock` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AutoencoderTinyBlock`, instead."
-    deprecate("AutoencoderTinyBlock", "0.29", deprecation_message)
-
-
-class UNetMidBlock2D(UNetMidBlock2D):
-    deprecation_message = "Importing `UNetMidBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2D`, instead."
-    deprecate("UNetMidBlock2D", "0.29", deprecation_message)
-
-
-class UNetMidBlock2DCrossAttn(UNetMidBlock2DCrossAttn):
-    deprecation_message = "Importing `UNetMidBlock2DCrossAttn` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2DCrossAttn`, instead."
-    deprecate("UNetMidBlock2DCrossAttn", "0.29", deprecation_message)
-
-
-class UNetMidBlock2DSimpleCrossAttn(UNetMidBlock2DSimpleCrossAttn):
-    deprecation_message = "Importing `UNetMidBlock2DSimpleCrossAttn` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2DSimpleCrossAttn`, instead."
-    deprecate("UNetMidBlock2DSimpleCrossAttn", "0.29", deprecation_message)
-
-
-class AttnDownBlock2D(AttnDownBlock2D):
-    deprecation_message = "Importing `AttnDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnDownBlock2D`, instead."
-    deprecate("AttnDownBlock2D", "0.29", deprecation_message)
-
-
-class CrossAttnDownBlock2D(CrossAttnDownBlock2D):
-    deprecation_message = "Importing `AttnDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import CrossAttnDownBlock2D`, instead."
-    deprecate("CrossAttnDownBlock2D", "0.29", deprecation_message)
-
-
-class DownBlock2D(DownBlock2D):
-    deprecation_message = "Importing `DownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import DownBlock2D`, instead."
-    deprecate("DownBlock2D", "0.29", deprecation_message)
-
-
-class AttnDownEncoderBlock2D(AttnDownEncoderBlock2D):
-    deprecation_message = "Importing `AttnDownEncoderBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnDownEncoderBlock2D`, instead."
-    deprecate("AttnDownEncoderBlock2D", "0.29", deprecation_message)
-
-
-class AttnSkipDownBlock2D(AttnSkipDownBlock2D):
-    deprecation_message = "Importing `AttnSkipDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnSkipDownBlock2D`, instead."
-    deprecate("AttnSkipDownBlock2D", "0.29", deprecation_message)
-
-
-class SkipDownBlock2D(SkipDownBlock2D):
-    deprecation_message = "Importing `SkipDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import SkipDownBlock2D`, instead."
-    deprecate("SkipDownBlock2D", "0.29", deprecation_message)
-
-
-class ResnetDownsampleBlock2D(ResnetDownsampleBlock2D):
-    deprecation_message = "Importing `ResnetDownsampleBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import ResnetDownsampleBlock2D`, instead."
-    deprecate("ResnetDownsampleBlock2D", "0.29", deprecation_message)
-
-
-class SimpleCrossAttnDownBlock2D(SimpleCrossAttnDownBlock2D):
-    deprecation_message = "Importing `SimpleCrossAttnDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import SimpleCrossAttnDownBlock2D`, instead."
-    deprecate("SimpleCrossAttnDownBlock2D", "0.29", deprecation_message)
-
-
-class KDownBlock2D(KDownBlock2D):
-    deprecation_message = "Importing `KDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KDownBlock2D`, instead."
-    deprecate("KDownBlock2D", "0.29", deprecation_message)
-
-
-class KCrossAttnDownBlock2D(KCrossAttnDownBlock2D):
-    deprecation_message = "Importing `KCrossAttnDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KCrossAttnDownBlock2D`, instead."
-    deprecate("KCrossAttnDownBlock2D", "0.29", deprecation_message)
-
-
-class AttnUpBlock2D(AttnUpBlock2D):
-    deprecation_message = "Importing `AttnUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnUpBlock2D`, instead."
-    deprecate("AttnUpBlock2D", "0.29", deprecation_message)
-
-
-class CrossAttnUpBlock2D(CrossAttnUpBlock2D):
-    deprecation_message = "Importing `CrossAttnUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import CrossAttnUpBlock2D`, instead."
-    deprecate("CrossAttnUpBlock2D", "0.29", deprecation_message)
-
-
-class UpBlock2D(UpBlock2D):
-    deprecation_message = "Importing `UpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UpBlock2D`, instead."
-    deprecate("UpBlock2D", "0.29", deprecation_message)
-
-
-class UpDecoderBlock2D(UpDecoderBlock2D):
-    deprecation_message = "Importing `UpDecoderBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UpDecoderBlock2D`, instead."
-    deprecate("UpDecoderBlock2D", "0.29", deprecation_message)
-
-
-class AttnUpDecoderBlock2D(AttnUpDecoderBlock2D):
-    deprecation_message = "Importing `AttnUpDecoderBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnUpDecoderBlock2D`, instead."
-    deprecate("AttnUpDecoderBlock2D", "0.29", deprecation_message)
-
-
-class AttnSkipUpBlock2D(AttnSkipUpBlock2D):
-    deprecation_message = "Importing `AttnSkipUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnSkipUpBlock2D`, instead."
-    deprecate("AttnSkipUpBlock2D", "0.29", deprecation_message)
-
-
-class SkipUpBlock2D(SkipUpBlock2D):
-    deprecation_message = "Importing `SkipUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import SkipUpBlock2D`, instead."
-    deprecate("SkipUpBlock2D", "0.29", deprecation_message)
-
-
-class ResnetUpsampleBlock2D(ResnetUpsampleBlock2D):
-    deprecation_message = "Importing `ResnetUpsampleBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import ResnetUpsampleBlock2D`, instead."
-    deprecate("ResnetUpsampleBlock2D", "0.29", deprecation_message)
-
-
-class SimpleCrossAttnUpBlock2D(SimpleCrossAttnUpBlock2D):
-    deprecation_message = "Importing `SimpleCrossAttnUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import SimpleCrossAttnUpBlock2D`, instead."
-    deprecate("SimpleCrossAttnUpBlock2D", "0.29", deprecation_message)
-
-
-class KUpBlock2D(KUpBlock2D):
-    deprecation_message = "Importing `KUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KUpBlock2D`, instead."
-    deprecate("KUpBlock2D", "0.29", deprecation_message)
-
-
-class KCrossAttnUpBlock2D(KCrossAttnUpBlock2D):
-    deprecation_message = "Importing `KCrossAttnUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KCrossAttnUpBlock2D`, instead."
-    deprecate("KCrossAttnUpBlock2D", "0.29", deprecation_message)
-
-
-# can potentially later be renamed to `No-feed-forward` attention
-class KAttentionBlock(KAttentionBlock):
-    deprecation_message = "Importing `KAttentionBlock` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KAttentionBlock`, instead."
-    deprecate("KAttentionBlock", "0.29", deprecation_message)
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
deleted file mode 100644
index 85a3e7b091..0000000000
--- a/src/diffusers/models/unet_2d_condition.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from ..utils import deprecate
-from .unets.unet_2d_condition import UNet2DConditionModel, UNet2DConditionOutput
-
-
-class UNet2DConditionOutput(UNet2DConditionOutput):
-    deprecation_message = "Importing `UNet2DConditionOutput` from `diffusers.models.unet_2d_condition` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput`, instead."
-    deprecate("UNet2DConditionOutput", "0.29", deprecation_message)
-
-
-class UNet2DConditionModel(UNet2DConditionModel):
-    deprecation_message = "Importing `UNet2DConditionModel` from `diffusers.models.unet_2d_condition` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel`, instead."
-    deprecate("UNet2DConditionModel", "0.29", deprecation_message)
diff --git a/src/diffusers/models/unets/unet_1d.py b/src/diffusers/models/unets/unet_1d.py
index d1538cdc61..8efabd98ee 100644
--- a/src/diffusers/models/unets/unet_1d.py
+++ b/src/diffusers/models/unets/unet_1d.py
@@ -206,11 +206,11 @@ class UNet1DModel(ModelMixin, ConfigMixin):
                 The noisy input tensor with the following shape `(batch_size, num_channels, sample_size)`.
             timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.unet_1d.UNet1DOutput`] instead of a plain tuple.
+                Whether or not to return a [`~models.unets.unet_1d.UNet1DOutput`] instead of a plain tuple.
 
         Returns:
-            [`~models.unet_1d.UNet1DOutput`] or `tuple`:
-                If `return_dict` is True, an [`~models.unet_1d.UNet1DOutput`] is returned, otherwise a `tuple` is
+            [`~models.unets.unet_1d.UNet1DOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_1d.UNet1DOutput`] is returned, otherwise a `tuple` is
                 returned where the first element is the sample tensor.
         """
 
diff --git a/src/diffusers/models/unets/unet_2d.py b/src/diffusers/models/unets/unet_2d.py
index 0f36afe3f9..5972505f28 100644
--- a/src/diffusers/models/unets/unet_2d.py
+++ b/src/diffusers/models/unets/unet_2d.py
@@ -257,11 +257,11 @@ class UNet2DModel(ModelMixin, ConfigMixin):
             class_labels (`torch.Tensor`, *optional*, defaults to `None`):
                 Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple.
+                Whether or not to return a [`~models.unets.unet_2d.UNet2DOutput`] instead of a plain tuple.
 
         Returns:
-            [`~models.unet_2d.UNet2DOutput`] or `tuple`:
-                If `return_dict` is True, an [`~models.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
+            [`~models.unets.unet_2d.UNet2DOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
                 returned where the first element is the sample tensor.
         """
         # 0. center input if necessary
diff --git a/src/diffusers/models/unets/unet_2d_condition.py b/src/diffusers/models/unets/unet_2d_condition.py
index ad45a43b50..084b7b64f9 100644
--- a/src/diffusers/models/unets/unet_2d_condition.py
+++ b/src/diffusers/models/unets/unet_2d_condition.py
@@ -110,13 +110,13 @@ class UNet2DConditionModel(
             The dimension of the cross attention features.
         transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
             The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
-            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
-            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+            [`~models.unets.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unets.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
         reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
             The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
             blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
-            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
-            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+            [`~models.unets.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unets.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
         encoder_hid_dim (`int`, *optional*, defaults to None):
             If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
             dimension to `cross_attention_dim`.
diff --git a/src/diffusers/models/unets/unet_3d_condition.py b/src/diffusers/models/unets/unet_3d_condition.py
index b4879fe963..331c8fba44 100644
--- a/src/diffusers/models/unets/unet_3d_condition.py
+++ b/src/diffusers/models/unets/unet_3d_condition.py
@@ -598,15 +598,15 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
             mid_block_additional_residual: (`torch.Tensor`, *optional*):
                 A tensor that if specified is added to the residual of the middle unet block.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+                Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
                 tuple.
             cross_attention_kwargs (`dict`, *optional*):
                 A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
 
         Returns:
-            [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
-                If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise
-                a `tuple` is returned where the first element is the sample tensor.
+            [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is the sample tensor.
         """
         # By default samples have to be AT least a multiple of the overall upsampling factor.
         # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py
index dbfb4f8025..276f1059bf 100644
--- a/src/diffusers/models/unets/unet_i2vgen_xl.py
+++ b/src/diffusers/models/unets/unet_i2vgen_xl.py
@@ -542,13 +542,13 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
                 `self.processor` in
                 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+                Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
                 tuple.
 
         Returns:
-            [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
-                If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise
-                a `tuple` is returned where the first element is the sample tensor.
+            [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is the sample tensor.
         """
         batch_size, channels, num_frames, height, width = sample.shape
 
diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py
index 1b62d16d5d..b224d9d733 100644
--- a/src/diffusers/models/unets/unet_motion_model.py
+++ b/src/diffusers/models/unets/unet_motion_model.py
@@ -856,13 +856,13 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
             mid_block_additional_residual: (`torch.Tensor`, *optional*):
                 A tensor that if specified is added to the residual of the middle unet block.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
+                Whether or not to return a [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
                 tuple.
 
         Returns:
-            [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
-                If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise
-                a `tuple` is returned where the first element is the sample tensor.
+            [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unets.unet_3d_condition.UNet3DConditionOutput`] is returned,
+                otherwise a `tuple` is returned where the first element is the sample tensor.
         """
         # By default samples have to be AT least a multiple of the overall upsampling factor.
         # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
diff --git a/src/diffusers/models/unets/unet_spatio_temporal_condition.py b/src/diffusers/models/unets/unet_spatio_temporal_condition.py
index 5613e3618d..bc3acdbece 100644
--- a/src/diffusers/models/unets/unet_spatio_temporal_condition.py
+++ b/src/diffusers/models/unets/unet_spatio_temporal_condition.py
@@ -57,9 +57,9 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
             The dimension of the cross attention features.
         transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
             The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
-            [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
-            [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
-            [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
+            [`~models.unets.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
+            [`~models.unets.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
+            [`~models.unets.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
         num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
             The number of attention heads.
         dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.