diff --git a/.github/workflows/pr_test_peft_backend.yml b/.github/workflows/pr_test_peft_backend.yml index 97aea28bdb..8cc4eb6e59 100644 --- a/.github/workflows/pr_test_peft_backend.yml +++ b/.github/workflows/pr_test_peft_backend.yml @@ -59,7 +59,7 @@ jobs: - name: Run fast PyTorch LoRA CPU tests with PEFT backend run: | - python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ + python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \ -s -v \ --make-reports=tests_${{ matrix.config.report }} \ tests/lora/test_lora_layers_peft.py diff --git a/docker/diffusers-pytorch-compile-cuda/Dockerfile b/docker/diffusers-pytorch-compile-cuda/Dockerfile index da9f372bd6..e1e63758a4 100644 --- a/docker/diffusers-pytorch-compile-cuda/Dockerfile +++ b/docker/diffusers-pytorch-compile-cuda/Dockerfile @@ -40,7 +40,6 @@ RUN python3.9 -m pip install --no-cache-dir --upgrade pip && \ numpy \ scipy \ tensorboard \ - transformers \ - omegaconf - + transformers + CMD ["/bin/bash"] diff --git a/docker/diffusers-pytorch-cuda/Dockerfile b/docker/diffusers-pytorch-cuda/Dockerfile index 877bc6840e..7fc3d8ced9 100644 --- a/docker/diffusers-pytorch-cuda/Dockerfile +++ b/docker/diffusers-pytorch-cuda/Dockerfile @@ -40,7 +40,6 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \ scipy \ tensorboard \ transformers \ - omegaconf \ pytorch-lightning CMD ["/bin/bash"] diff --git a/docker/diffusers-pytorch-xformers-cuda/Dockerfile b/docker/diffusers-pytorch-xformers-cuda/Dockerfile index 003f8e1165..8f2619c623 100644 --- a/docker/diffusers-pytorch-xformers-cuda/Dockerfile +++ b/docker/diffusers-pytorch-xformers-cuda/Dockerfile @@ -40,7 +40,6 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \ scipy \ tensorboard \ transformers \ - omegaconf \ xformers CMD ["/bin/bash"] diff --git a/docs/source/en/api/models/autoencoderkl.md b/docs/source/en/api/models/autoencoderkl.md index 72427ab30e..3534c8250d 100644 --- a/docs/source/en/api/models/autoencoderkl.md +++ b/docs/source/en/api/models/autoencoderkl.md @@ -33,6 +33,9 @@ model = AutoencoderKL.from_single_file(url) ## AutoencoderKL [[autodoc]] AutoencoderKL + - decode + - encode + - all ## AutoencoderKLOutput diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md index fb38687e88..4e1670df77 100644 --- a/docs/source/en/api/pipelines/animatediff.md +++ b/docs/source/en/api/pipelines/animatediff.md @@ -235,6 +235,62 @@ export_to_gif(frames, "animation.gif") +## Using FreeInit + +[FreeInit: Bridging Initialization Gap in Video Diffusion Models](https://arxiv.org/abs/2312.07537) by Tianxing Wu, Chenyang Si, Yuming Jiang, Ziqi Huang, Ziwei Liu. + +FreeInit is an effective method that improves temporal consistency and overall quality of videos generated using video-diffusion-models without any addition training. It can be applied to AnimateDiff, ModelScope, VideoCrafter and various other video generation models seamlessly at inference time, and works by iteratively refining the latent-initialization noise. More details can be found it the paper. + +The following example demonstrates the usage of FreeInit. 
+ +```python +import torch +from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler +from diffusers.utils import export_to_gif + +adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2") +model_id = "SG161222/Realistic_Vision_V5.1_noVAE" +pipe = AnimateDiffPipeline.from_pretrained(model_id, motion_adapter=adapter, torch_dtype=torch.float16).to("cuda") +pipe.scheduler = DDIMScheduler.from_pretrained( + model_id, + subfolder="scheduler", + beta_schedule="linear", + clip_sample=False, + timestep_spacing="linspace", + steps_offset=1 +) + +# enable memory savings +pipe.enable_vae_slicing() +pipe.enable_vae_tiling() + +# enable FreeInit +# Refer to the enable_free_init documentation for a full list of configurable parameters +pipe.enable_free_init(method="butterworth", use_fast_sampling=True) + +# run inference +output = pipe( + prompt="a panda playing a guitar, on a boat, in the ocean, high quality", + negative_prompt="bad quality, worse quality", + num_frames=16, + guidance_scale=7.5, + num_inference_steps=20, + generator=torch.Generator("cpu").manual_seed(666), +) + +# disable FreeInit +pipe.disable_free_init() + +frames = output.frames[0] +export_to_gif(frames, "animation.gif") +``` + + + +FreeInit is not really free - the improved quality comes at the cost of extra computation. It requires sampling a few extra times depending on the `num_iters` parameter that is set when enabling it. Setting the `use_fast_sampling` parameter to `True` can improve the overall performance (at the cost of lower quality compared to when `use_fast_sampling=False` but still better results than vanilla video generation models). + + + Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines. @@ -248,6 +304,8 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) - __call__ - enable_freeu - disable_freeu + - enable_free_init + - disable_free_init - enable_vae_slicing - disable_vae_slicing - enable_vae_tiling diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index 3bf1d46fd0..8e0eddd175 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -37,8 +37,10 @@ source .env/bin/activate You should also install 🤗 Transformers because 🤗 Diffusers relies on its models: + +Note - PyTorch only supports Python 3.8 - 3.11 on Windows. 
```bash pip install diffusers["torch"] transformers ``` diff --git a/docs/source/en/using-diffusers/controlnet.md b/docs/source/en/using-diffusers/controlnet.md index e7f6eb2756..ac4bfa4472 100644 --- a/docs/source/en/using-diffusers/controlnet.md +++ b/docs/source/en/using-diffusers/controlnet.md @@ -429,7 +429,7 @@ image = pipe( make_image_grid([original_image, canny_image, image], rows=1, cols=3) ``` -### MultiControlNet +## MultiControlNet diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index d9d4a675dd..0ef90c6dd9 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -344,7 +344,8 @@ pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-a IP-Adapter relies on an image encoder to generate the image features, if your IP-Adapter weights folder contains a "image_encoder" subfolder, the image encoder will be automatically loaded and registered to the pipeline. Otherwise you can so load a [`~transformers.CLIPVisionModelWithProjection`] model and pass it to a Stable Diffusion pipeline when you create it. ```py -from diffusers import AutoPipelineForText2Image, CLIPVisionModelWithProjection +from diffusers import AutoPipelineForText2Image +from transformers import CLIPVisionModelWithProjection import torch image_encoder = CLIPVisionModelWithProjection.from_pretrained( diff --git a/docs/source/en/using-diffusers/sdxl.md b/docs/source/en/using-diffusers/sdxl.md index 25b581fc6f..906eb0dbbe 100644 --- a/docs/source/en/using-diffusers/sdxl.md +++ b/docs/source/en/using-diffusers/sdxl.md @@ -26,7 +26,7 @@ Before you begin, make sure you have the following libraries installed: ```py # uncomment to install the necessary libraries in Colab -#!pip install -q diffusers transformers accelerate omegaconf invisible-watermark>=0.2.0 +#!pip install -q diffusers transformers accelerate invisible-watermark>=0.2.0 ``` diff --git a/docs/source/en/using-diffusers/sdxl_turbo.md b/docs/source/en/using-diffusers/sdxl_turbo.md index 99e1c7000e..ceca5729b1 100644 --- a/docs/source/en/using-diffusers/sdxl_turbo.md +++ b/docs/source/en/using-diffusers/sdxl_turbo.md @@ -23,7 +23,7 @@ Before you begin, make sure you have the following libraries installed: ```py # uncomment to install the necessary libraries in Colab -#!pip install -q diffusers transformers accelerate omegaconf +#!pip install -q diffusers transformers accelerate ``` ## Load model checkpoints diff --git a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py index ddd8114ae4..e35630e3e8 100644 --- a/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py +++ b/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py @@ -38,7 +38,7 @@ from accelerate.logging import get_logger from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed from huggingface_hub import create_repo, upload_folder from packaging import version -from peft import LoraConfig +from peft import LoraConfig, set_peft_model_state_dict from peft.utils import get_peft_model_state_dict from PIL import Image from PIL.ImageOps import exif_transpose @@ -58,15 +58,17 @@ from diffusers import ( ) from diffusers.loaders import LoraLoaderMixin from diffusers.optimization import get_scheduler -from diffusers.training_utils import compute_snr +from diffusers.training_utils 
import _set_state_dict_into_text_encoder, cast_training_params, compute_snr from diffusers.utils import ( check_min_version, convert_all_state_dict_to_peft, convert_state_dict_to_diffusers, convert_state_dict_to_kohya, + convert_unet_state_dict_to_peft, is_wandb_available, ) from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import is_compiled_module # Will error if the minimal version of diffusers is not installed. Remove at your own risks. @@ -1277,7 +1279,7 @@ def main(args): for name, param in text_encoder_one.named_parameters(): if "token_embedding" in name: # ensure that dtype is float32, even if rest of the model that isn't trained is loaded in fp16 - param = param.to(dtype=torch.float32) + param.data = param.to(dtype=torch.float32) param.requires_grad = True text_lora_parameters_one.append(param) else: @@ -1286,22 +1288,16 @@ def main(args): for name, param in text_encoder_two.named_parameters(): if "token_embedding" in name: # ensure that dtype is float32, even if rest of the model that isn't trained is loaded in fp16 - param = param.to(dtype=torch.float32) + param.data = param.to(dtype=torch.float32) param.requires_grad = True text_lora_parameters_two.append(param) else: param.requires_grad = False - # Make sure the trainable params are in float32. - if args.mixed_precision == "fp16": - models = [unet] - if args.train_text_encoder: - models.extend([text_encoder_one, text_encoder_two]) - for model in models: - for param in model.parameters(): - # only upcast trainable parameters (LoRA) into fp32 - if param.requires_grad: - param.data = param.to(torch.float32) + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): @@ -1313,14 +1309,14 @@ def main(args): text_encoder_two_lora_layers_to_save = None for model in models: - if isinstance(model, type(accelerator.unwrap_model(unet))): + if isinstance(model, type(unwrap_model(unet))): unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model)) - elif isinstance(model, type(accelerator.unwrap_model(text_encoder_one))): + elif isinstance(model, type(unwrap_model(text_encoder_one))): if args.train_text_encoder: text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers( get_peft_model_state_dict(model) ) - elif isinstance(model, type(accelerator.unwrap_model(text_encoder_two))): + elif isinstance(model, type(unwrap_model(text_encoder_two))): if args.train_text_encoder: text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers( get_peft_model_state_dict(model) @@ -1348,27 +1344,44 @@ def main(args): while len(models) > 0: model = models.pop() - if isinstance(model, type(accelerator.unwrap_model(unet))): + if isinstance(model, type(unwrap_model(unet))): unet_ = model - elif isinstance(model, type(accelerator.unwrap_model(text_encoder_one))): + elif isinstance(model, type(unwrap_model(text_encoder_one))): text_encoder_one_ = model - elif isinstance(model, type(accelerator.unwrap_model(text_encoder_two))): + elif isinstance(model, type(unwrap_model(text_encoder_two))): text_encoder_two_ = model else: raise ValueError(f"unexpected save model: {model.__class__}") lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) - LoraLoaderMixin.load_lora_into_unet(lora_state_dict, 
network_alphas=network_alphas, unet=unet_) - text_encoder_state_dict = {k: v for k, v in lora_state_dict.items() if "text_encoder." in k} - LoraLoaderMixin.load_lora_into_text_encoder( - text_encoder_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_one_ - ) + unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} + unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) + incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default") + if incompatible_keys is not None: + # check only for unexpected keys + unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) + if unexpected_keys: + logger.warning( + f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " + f" {unexpected_keys}. " + ) - text_encoder_2_state_dict = {k: v for k, v in lora_state_dict.items() if "text_encoder_2." in k} - LoraLoaderMixin.load_lora_into_text_encoder( - text_encoder_2_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_two_ - ) + if args.train_text_encoder: + _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_) + + _set_state_dict_into_text_encoder( + lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_two_ + ) + + # Make sure the trainable params are in float32. This is again needed since the base models + # are in `weight_dtype`. More details: + # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804 + if args.mixed_precision == "fp16": + models = [unet_] + if args.train_text_encoder: + models.extend([text_encoder_one_, text_encoder_two_]) + cast_training_params(models) accelerator.register_save_state_pre_hook(save_model_hook) accelerator.register_load_state_pre_hook(load_model_hook) @@ -1383,6 +1396,13 @@ def main(args): args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes ) + # Make sure the trainable params are in float32. 
+ if args.mixed_precision == "fp16": + models = [unet] + if args.train_text_encoder: + models.extend([text_encoder_one, text_encoder_two]) + cast_training_params(models, dtype=torch.float32) + unet_lora_parameters = list(filter(lambda p: p.requires_grad, unet.parameters())) if args.train_text_encoder: @@ -1705,19 +1725,19 @@ def main(args): num_train_epochs_text_encoder = int(args.train_text_encoder_frac * args.num_train_epochs) elif args.train_text_encoder_ti: # args.train_text_encoder_ti num_train_epochs_text_encoder = int(args.train_text_encoder_ti_frac * args.num_train_epochs) - + # flag used for textual inversion + pivoted = False for epoch in range(first_epoch, args.num_train_epochs): # if performing any kind of optimization of text_encoder params if args.train_text_encoder or args.train_text_encoder_ti: if epoch == num_train_epochs_text_encoder: print("PIVOT HALFWAY", epoch) # stopping optimization of text_encoder params - # re setting the optimizer to optimize only on unet params - optimizer.param_groups[1]["lr"] = 0.0 - optimizer.param_groups[2]["lr"] = 0.0 + # this flag is used to reset the optimizer to optimize only on unet params + pivoted = True else: - # still optimizng the text encoder + # still optimizing the text encoder text_encoder_one.train() text_encoder_two.train() # set top parameter requires_grad = True for gradient checkpointing works @@ -1727,6 +1747,12 @@ def main(args): unet.train() for step, batch in enumerate(train_dataloader): + if pivoted: + # stopping optimization of text_encoder params + # re setting the optimizer to optimize only on unet params + optimizer.param_groups[1]["lr"] = 0.0 + optimizer.param_groups[2]["lr"] = 0.0 + with accelerator.accumulate(unet): prompts = batch["prompts"] # encode batch prompts when custom prompts are provided for each image - @@ -1865,8 +1891,7 @@ def main(args): # every step, we reset the embeddings to the original embeddings. if args.train_text_encoder_ti: - for idx, text_encoder in enumerate(text_encoders): - embedding_handler.retract_embeddings() + embedding_handler.retract_embeddings() # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: diff --git a/examples/community/README.md b/examples/community/README.md index 7d8d190f03..45d393f84f 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -58,6 +58,7 @@ prompt-to-prompt | change parts of a prompt and retain image structure (see [pap | Null-Text Inversion Pipeline | Implement [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/abs/2211.09794) as a pipeline. 
| [Null-Text Inversion](https://github.com/google/prompt-to-prompt/) | - | [Junsheng Luan](https://github.com/Junsheng121) | | Rerender A Video Pipeline | Implementation of [[SIGGRAPH Asia 2023] Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation](https://arxiv.org/abs/2306.07954) | [Rerender A Video Pipeline](#Rerender_A_Video) | - | [Yifan Zhou](https://github.com/SingleZombie) | | StyleAligned Pipeline | Implementation of [Style Aligned Image Generation via Shared Attention](https://arxiv.org/abs/2312.02133) | [StyleAligned Pipeline](#stylealigned-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/15X2E0jFPTajUIjS0FzX50OaHsCbP2lQ0/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) | +| IP Adapter FaceID Stable Diffusion | Stable Diffusion Pipeline that supports IP Adapter Face ID | [IP Adapter Face ID](#ip-adapter-face-id) | - | [Fabio Rigano](https://github.com/fabiorigano) | To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. ```py @@ -2989,7 +2990,7 @@ pipe = DiffusionPipeline.from_pretrained( custom_pipeline="pipeline_animatediff_controlnet", ).to(device="cuda", dtype=torch.float16) pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( - model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1 + model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1, beta_schedule="linear", ) pipe.enable_vae_slicing() @@ -3005,7 +3006,7 @@ result = pipe( width=512, height=768, conditioning_frames=conditioning_frames, - num_inference_steps=12, + num_inference_steps=20, ).frames[0] from diffusers.utils import export_to_gif @@ -3029,6 +3030,79 @@ export_to_gif(result.frames[0], "result.gif") +You can also use multiple controlnets at once! 
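+Note that the `load_video` helper in the example below also uses `requests`, `io.BytesIO`, and `imageio`, which are not included in the imports shown; add `import requests`, `from io import BytesIO`, and `import imageio` (and `pip install requests imageio`) before running it, alongside `controlnet_aux` for the frame processors.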
+ +```python +import torch +from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter +from diffusers.pipelines import DiffusionPipeline +from diffusers.schedulers import DPMSolverMultistepScheduler +from PIL import Image + +motion_id = "guoyww/animatediff-motion-adapter-v1-5-2" +adapter = MotionAdapter.from_pretrained(motion_id) +controlnet1 = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float16) +controlnet2 = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) +vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16) + +model_id = "SG161222/Realistic_Vision_V5.1_noVAE" +pipe = DiffusionPipeline.from_pretrained( + model_id, + motion_adapter=adapter, + controlnet=[controlnet1, controlnet2], + vae=vae, + custom_pipeline="pipeline_animatediff_controlnet", +).to(device="cuda", dtype=torch.float16) +pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( + model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1, beta_schedule="linear", +) +pipe.enable_vae_slicing() + +def load_video(file_path: str): + images = [] + + if file_path.startswith(('http://', 'https://')): + # If the file_path is a URL + response = requests.get(file_path) + response.raise_for_status() + content = BytesIO(response.content) + vid = imageio.get_reader(content) + else: + # Assuming it's a local file path + vid = imageio.get_reader(file_path) + + for frame in vid: + pil_image = Image.fromarray(frame) + images.append(pil_image) + + return images + +video = load_video("dance.gif") + +# You need to install it using `pip install controlnet_aux` +from controlnet_aux.processor import Processor + +p1 = Processor("openpose_full") +cn1 = [p1(frame) for frame in video] + +p2 = Processor("canny") +cn2 = [p2(frame) for frame in video] + +prompt = "astronaut in space, dancing" +negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly" +result = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + width=512, + height=768, + conditioning_frames=[cn1, cn2], + num_inference_steps=20, +) + +from diffusers.utils import export_to_gif +export_to_gif(result.frames[0], "result.gif") +``` + ### DemoFusion This pipeline is the official implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973). @@ -3333,4 +3407,63 @@ images = pipe( # Disable StyleAligned if you do not wish to use it anymore pipe.disable_style_aligned() -``` \ No newline at end of file +``` + +### IP Adapter Face ID +IP Adapter FaceID is an experimental IP Adapter model that uses image embeddings generated by `insightface`, so no image encoder needs to be loaded. +You need to install `insightface` and all its requirements to use this model. +You must pass the image embedding tensor as `image_embeds` to the StableDiffusionPipeline instead of `ip_adapter_image`. +You have to disable PEFT BACKEND in order to load weights. 
+ +```py +import diffusers +diffusers.utils.USE_PEFT_BACKEND = False +import torch +from diffusers.utils import load_image +import cv2 +import numpy as np +from diffusers import DiffusionPipeline, AutoencoderKL, DDIMScheduler +from insightface.app import FaceAnalysis + + +noise_scheduler = DDIMScheduler( + num_train_timesteps=1000, + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + steps_offset=1, +) +vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse").to(dtype=torch.float16) +pipeline = DiffusionPipeline.from_pretrained( + "SG161222/Realistic_Vision_V4.0_noVAE", + torch_dtype=torch.float16, + scheduler=noise_scheduler, + vae=vae, + custom_pipeline="ip_adapter_face_id" +) +pipeline.load_ip_adapter_face_id("h94/IP-Adapter-FaceID", "ip-adapter-faceid_sd15.bin") +pipeline.to("cuda") + +generator = torch.Generator(device="cpu").manual_seed(42) +num_images=2 + +image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ai_face2.png") + +app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) +app.prepare(ctx_id=0, det_size=(640, 640)) +image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB) +faces = app.get(image) +image = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0) +images = pipeline( + prompt="A photo of a girl wearing a black dress, holding red roses in hand, upper body, behind is the Eiffel Tower", + image_embeds=image, + negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", + num_inference_steps=20, num_images_per_prompt=num_images, width=512, height=704, + generator=generator +).images + +for i in range(num_images): + images[i].save(f"c{i}.png") +``` diff --git a/examples/community/ip_adapter_face_id.py b/examples/community/ip_adapter_face_id.py new file mode 100644 index 0000000000..d9325742cf --- /dev/null +++ b/examples/community/ip_adapter_face_id.py @@ -0,0 +1,1525 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
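+
+# Community "IP Adapter FaceID" pipeline: a Stable Diffusion pipeline conditioned on
+# face embeddings produced by `insightface` and passed in as `image_embeds`, so no
+# CLIP image encoder is needed (see the "IP Adapter Face ID" section of
+# examples/community/README.md).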
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version +from safetensors import safe_open +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from diffusers.configuration_utils import FrozenDict +from diffusers.image_processor import VaeImageProcessor +from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.models.attention_processor import FusedAttnProcessor2_0 +from diffusers.models.lora import LoRALinearLayer, adjust_lora_scale_text_encoder +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import ( + USE_PEFT_BACKEND, + _get_model_file, + deprecate, + logging, + scale_lora_layers, + unscale_lora_layers, +) +from diffusers.utils.torch_utils import randn_tensor + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class LoRAIPAdapterAttnProcessor(nn.Module): + r""" + Attention processor for IP-Adapater. + Args: + hidden_size (`int`): + The hidden size of the attention layer. + cross_attention_dim (`int`): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. + lora_scale (`float`, defaults to 1.0): + the weight scale of LoRA. + scale (`float`, defaults to 1.0): + the weight scale of image prompt. + num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16): + The context length of the image features. 
+ """ + + def __init__( + self, + hidden_size, + cross_attention_dim=None, + rank=4, + network_alpha=None, + lora_scale=1.0, + scale=1.0, + num_tokens=4, + ): + super().__init__() + + self.rank = rank + self.lora_scale = lora_scale + + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.scale = scale + self.num_tokens = num_tokens + + self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + ): + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + else: + # get encoder_hidden_states, ip_hidden_states + end_pos = encoder_hidden_states.shape[1] - self.num_tokens + encoder_hidden_states, ip_hidden_states = ( + encoder_hidden_states[:, :end_pos, :], + encoder_hidden_states[:, end_pos:, :], + ) + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # for ip-adapter + ip_key = self.to_k_ip(ip_hidden_states) + ip_value = self.to_v_ip(ip_hidden_states) + + ip_key = attn.head_to_batch_dim(ip_key) + ip_value = attn.head_to_batch_dim(ip_value) + + ip_attention_probs = attn.get_attention_scores(query, ip_key, None) + ip_hidden_states = torch.bmm(ip_attention_probs, ip_value) + ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states) + + hidden_states = hidden_states + self.scale * ip_hidden_states + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = 
hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class LoRAIPAdapterAttnProcessor2_0(nn.Module): + r""" + Attention processor for IP-Adapater for PyTorch 2.0. + Args: + hidden_size (`int`): + The hidden size of the attention layer. + cross_attention_dim (`int`): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. + lora_scale (`float`, defaults to 1.0): + the weight scale of LoRA. + scale (`float`, defaults to 1.0): + the weight scale of image prompt. + num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16): + The context length of the image features. + """ + + def __init__( + self, + hidden_size, + cross_attention_dim=None, + rank=4, + network_alpha=None, + lora_scale=1.0, + scale=1.0, + num_tokens=4, + ): + super().__init__() + + self.rank = rank + self.lora_scale = lora_scale + + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.scale = scale + self.num_tokens = num_tokens + + self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + ): + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + else: + # get encoder_hidden_states, ip_hidden_states + end_pos = encoder_hidden_states.shape[1] - self.num_tokens + encoder_hidden_states, ip_hidden_states = ( + encoder_hidden_states[:, :end_pos, :], + encoder_hidden_states[:, end_pos:, :], + ) + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states) + + inner_dim = 
key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # for ip-adapter + ip_key = self.to_k_ip(ip_hidden_states) + ip_value = self.to_v_ip(ip_hidden_states) + + ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + ip_hidden_states = F.scaled_dot_product_attention( + query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False + ) + + ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + ip_hidden_states = ip_hidden_states.to(query.dtype) + + hidden_states = hidden_states + self.scale * ip_hidden_states + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class IPAdapterFullImageProjection(nn.Module): + def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mult=1, num_tokens=1): + super().__init__() + from diffusers.models.attention import FeedForward + + self.num_tokens = num_tokens + self.cross_attention_dim = cross_attention_dim + self.ff = FeedForward(image_embed_dim, cross_attention_dim * num_tokens, mult=mult, activation_fn="gelu") + self.norm = nn.LayerNorm(cross_attention_dim) + + def forward(self, image_embeds: torch.FloatTensor): + x = self.ff(image_embeds) + x = x.reshape(-1, self.num_tokens, self.cross_attention_dim) + return self.norm(x) + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class IPAdapterFaceIDStableDiffusionPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin +): + r""" + Pipeline for text-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. 
+ text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. 
Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. 
If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + def load_ip_adapter_face_id(self, pretrained_model_name_or_path_or_dict, weight_name, **kwargs): + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", None) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + + user_agent = { + "file_type": "attn_procs_weights", + "framework": "pytorch", + } + model_file = _get_model_file( + pretrained_model_name_or_path_or_dict, + weights_name=weight_name, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + if weight_name.endswith(".safetensors"): + state_dict = {"image_proj": {}, "ip_adapter": {}} + with safe_open(model_file, framework="pt", device="cpu") as f: + for key in f.keys(): + if key.startswith("image_proj."): + state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key) + elif key.startswith("ip_adapter."): + state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key) + else: + state_dict = torch.load(model_file, map_location="cpu") + self._load_ip_adapter_weights(state_dict) + + def convert_ip_adapter_image_proj_to_diffusers(self, state_dict): + updated_state_dict = {} + clip_embeddings_dim_in = state_dict["proj.0.weight"].shape[1] + clip_embeddings_dim_out = state_dict["proj.0.weight"].shape[0] + multiplier = clip_embeddings_dim_out // clip_embeddings_dim_in + norm_layer = "norm.weight" + cross_attention_dim = state_dict[norm_layer].shape[0] + num_tokens = state_dict["proj.2.weight"].shape[0] // cross_attention_dim + + image_projection = IPAdapterFullImageProjection( + cross_attention_dim=cross_attention_dim, + image_embed_dim=clip_embeddings_dim_in, + mult=multiplier, + num_tokens=num_tokens, + ) + + for key, value in state_dict.items(): + diffusers_name = key.replace("proj.0", "ff.net.0.proj") + diffusers_name = diffusers_name.replace("proj.2", "ff.net.2") + updated_state_dict[diffusers_name] = value + + image_projection.load_state_dict(updated_state_dict) + return image_projection + + def _load_ip_adapter_weights(self, state_dict): + from diffusers.models.attention_processor import ( + AttnProcessor, + AttnProcessor2_0, + ) + + num_image_text_embeds = 4 + + self.unet.encoder_hid_proj = None + + # set ip-adapter cross-attention processors & load state_dict + attn_procs = {} + key_id = 0 + for name in self.unet.attn_processors.keys(): + 
cross_attention_dim = None if name.endswith("attn1.processor") else self.unet.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = self.unet.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(self.unet.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = self.unet.config.block_out_channels[block_id] + if cross_attention_dim is None or "motion_modules" in name: + attn_processor_class = ( + AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor + ) + attn_procs[name] = attn_processor_class() + rank = state_dict["ip_adapter"][f"{key_id}.to_q_lora.down.weight"].shape[0] + attn_module = self.unet + for n in name.split(".")[:-1]: + attn_module = getattr(attn_module, n) + # Set the `lora_layer` attribute of the attention-related matrices. + attn_module.to_q.set_lora_layer( + LoRALinearLayer( + in_features=attn_module.to_q.in_features, + out_features=attn_module.to_q.out_features, + rank=rank, + ) + ) + attn_module.to_k.set_lora_layer( + LoRALinearLayer( + in_features=attn_module.to_k.in_features, + out_features=attn_module.to_k.out_features, + rank=rank, + ) + ) + attn_module.to_v.set_lora_layer( + LoRALinearLayer( + in_features=attn_module.to_v.in_features, + out_features=attn_module.to_v.out_features, + rank=rank, + ) + ) + attn_module.to_out[0].set_lora_layer( + LoRALinearLayer( + in_features=attn_module.to_out[0].in_features, + out_features=attn_module.to_out[0].out_features, + rank=rank, + ) + ) + + value_dict = {} + for k, module in attn_module.named_children(): + index = "." + if not hasattr(module, "set_lora_layer"): + index = ".0." 
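+                        # e.g. `to_out` is an nn.ModuleList (Linear followed by Dropout); the Linear carrying the LoRA layer sits at index 0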
+ module = module[0] + lora_layer = getattr(module, "lora_layer") + for lora_name, w in lora_layer.state_dict().items(): + value_dict.update( + { + f"{k}{index}lora_layer.{lora_name}": state_dict["ip_adapter"][ + f"{key_id}.{k}_lora.{lora_name}" + ] + } + ) + + attn_module.load_state_dict(value_dict, strict=False) + attn_module.to(dtype=self.dtype, device=self.device) + key_id += 1 + else: + rank = state_dict["ip_adapter"][f"{key_id}.to_q_lora.down.weight"].shape[0] + attn_processor_class = ( + LoRAIPAdapterAttnProcessor2_0 + if hasattr(F, "scaled_dot_product_attention") + else LoRAIPAdapterAttnProcessor + ) + attn_procs[name] = attn_processor_class( + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + scale=1.0, + rank=rank, + num_tokens=num_image_text_embeds, + ).to(dtype=self.dtype, device=self.device) + + value_dict = {} + for k, w in attn_procs[name].state_dict().items(): + value_dict.update({f"{k}": state_dict["ip_adapter"][f"{key_id}.{k}"]}) + + attn_procs[name].load_state_dict(value_dict) + key_id += 1 + + self.unet.set_attn_processor(attn_procs) + + # convert IP-Adapter Image Projection layers to diffusers + image_projection = self.convert_ip_adapter_image_proj_to_diffusers(state_dict["image_proj"]) + + self.unet.encoder_hid_proj = image_projection.to(device=self.device, dtype=self.dtype) + self.unet.config.encoder_hid_dim_type = "ip_image_proj" + + def set_ip_adapter_scale(self, scale): + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + for attn_processor in unet.attn_processors.values(): + if isinstance(attn_processor, (LoRAIPAdapterAttnProcessor, LoRAIPAdapterAttnProcessor2_0)): + attn_processor.scale = scale + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." 
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + 
torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. 
Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): + r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stages where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values + that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. + + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. + """ + if not hasattr(self, "unet"): + raise ValueError("The pipeline must have `unet` for using FreeU.") + self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) + + def disable_freeu(self): + """Disables the FreeU mechanism if enabled.""" + self.unet.disable_freeu() + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections + def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. 
For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + """ + self.fusing_unet = False + self.fusing_vae = False + + if unet: + self.fusing_unet = True + self.unet.fuse_qkv_projections() + self.unet.set_attn_processor(FusedAttnProcessor2_0()) + + if vae: + if not isinstance(self.vae, AutoencoderKL): + raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") + + self.fusing_vae = True + self.vae.fuse_qkv_projections() + self.vae.set_attn_processor(FusedAttnProcessor2_0()) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections + def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """Disable QKV projection fusion if enabled. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + + """ + if unet: + if not self.fusing_unet: + logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") + else: + self.unet.unfuse_qkv_projections() + self.fusing_unet = False + + if vae: + if not self.fusing_vae: + logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.") + else: + self.vae.unfuse_qkv_projections() + self.fusing_vae = False + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. 
Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + image_embeds (`torch.FloatTensor`, *optional*): + Pre-generated image embeddings. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when + using zero terminal SNR. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. 
+ """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + # to deal with lora scaling and other possible forward hooks + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if image_embeds is not None: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0).to( + device=device, dtype=prompt_embeds.dtype + ) + negative_image_embeds = torch.zeros_like(image_embeds) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if image_embeds is not None else None + + # 6.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, 
nsfw_content_detected=has_nsfw_concept) diff --git a/examples/community/pipeline_animatediff_controlnet.py b/examples/community/pipeline_animatediff_controlnet.py index 785f1ee55e..cf0c66bb50 100644 --- a/examples/community/pipeline_animatediff_controlnet.py +++ b/examples/community/pipeline_animatediff_controlnet.py @@ -14,7 +14,7 @@ import inspect from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -66,7 +66,7 @@ EXAMPLE_DOC_STRING = """ ... custom_pipeline="pipeline_animatediff_controlnet", ... ).to(device="cuda", dtype=torch.float16) >>> pipe.scheduler = DPMSolverMultistepScheduler.from_pretrained( - ... model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1 + ... model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", steps_offset=1, beta_schedule="linear", ... ) >>> pipe.enable_vae_slicing() @@ -83,7 +83,7 @@ EXAMPLE_DOC_STRING = """ ... height=768, ... conditioning_frames=conditioning_frames, ... num_inference_steps=12, - ... ).frames[0] + ... ) >>> from diffusers.utils import export_to_gif >>> export_to_gif(result.frames[0], "result.gif") @@ -151,7 +151,7 @@ class AnimateDiffControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMix tokenizer: CLIPTokenizer, unet: UNet2DConditionModel, motion_adapter: MotionAdapter, - controlnet: Union[ControlNetModel, MultiControlNetModel], + controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel], scheduler: Union[ DDIMScheduler, PNDMScheduler, @@ -166,6 +166,9 @@ class AnimateDiffControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMix super().__init__() unet = UNetMotionModel.from_unet2d(unet, motion_adapter) + if isinstance(controlnet, (list, tuple)): + controlnet = MultiControlNetModel(controlnet) + self.register_modules( vae=vae, text_encoder=text_encoder, @@ -488,6 +491,7 @@ class AnimateDiffControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMix prompt, height, width, + num_frames, callback_steps, negative_prompt=None, prompt_embeds=None, @@ -557,31 +561,21 @@ class AnimateDiffControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMix or is_compiled and isinstance(self.controlnet._orig_mod, ControlNetModel) ): - if isinstance(image, list): - for image_ in image: - self.check_image(image_, prompt, prompt_embeds) - else: - self.check_image(image, prompt, prompt_embeds) + if not isinstance(image, list): + raise TypeError(f"For single controlnet, `image` must be of type `list` but got {type(image)}") + if len(image) != num_frames: + raise ValueError(f"Excepted image to have length {num_frames} but got {len(image)=}") elif ( isinstance(self.controlnet, MultiControlNetModel) or is_compiled and isinstance(self.controlnet._orig_mod, MultiControlNetModel) ): - if not isinstance(image, list): - raise TypeError("For multiple controlnets: `image` must be type `list`") - - # When `image` is a nested list: - # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) - elif any(isinstance(i, list) for i in image): - raise ValueError("A single batch of multiple conditionings are supported at the moment.") - elif len(image) != len(self.controlnet.nets): - raise ValueError( - f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." 
- ) - - for control_ in image: - for image_ in control_: - self.check_image(image_, prompt, prompt_embeds) + if not isinstance(image, list) or not isinstance(image[0], list): + raise TypeError(f"For multiple controlnets: `image` must be type list of lists but got {type(image)=}") + if len(image[0]) != num_frames: + raise ValueError(f"Expected length of image sublist as {num_frames} but got {len(image[0])=}") + if any(len(img) != len(image[0]) for img in image): + raise ValueError("All conditioning frame batches for multicontrolnet must be same size") else: assert False @@ -913,6 +907,7 @@ class AnimateDiffControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMix prompt=prompt, height=height, width=width, + num_frames=num_frames, callback_steps=callback_steps, negative_prompt=negative_prompt, callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, @@ -1000,9 +995,7 @@ class AnimateDiffControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMix do_classifier_free_guidance=self.do_classifier_free_guidance, guess_mode=guess_mode, ) - cond_prepared_frames.append(prepared_frame) - conditioning_frames = cond_prepared_frames else: assert False diff --git a/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py b/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py index a63654bc99..44a58fa2a8 100644 --- a/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py +++ b/examples/consistency_distillation/train_lcm_distill_lora_sdxl.py @@ -51,7 +51,7 @@ from diffusers import ( UNet2DConditionModel, ) from diffusers.optimization import get_scheduler -from diffusers.training_utils import resolve_interpolation_mode +from diffusers.training_utils import cast_training_params, resolve_interpolation_mode from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, is_wandb_available from diffusers.utils.import_utils import is_xformers_available @@ -860,10 +860,8 @@ def main(args): # Make sure the trainable params are in float32. if args.mixed_precision == "fp16": - for param in unet.parameters(): - # only upcast trainable parameters (LoRA) into fp32 - if param.requires_grad: - param.data = param.to(torch.float32) + # only upcast trainable parameters (LoRA) into fp32 + cast_training_params(unet, dtype=torch.float32) # Also move the alpha and sigma noise schedules to accelerator.device. 
alpha_schedule = alpha_schedule.to(accelerator.device) diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 2d2629b2fd..3724e3d140 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -35,7 +35,7 @@ from huggingface_hub import create_repo, upload_folder from huggingface_hub.utils import insecure_hashlib from packaging import version from peft import LoraConfig -from peft.utils import get_peft_model_state_dict +from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict from PIL import Image from PIL.ImageOps import exif_transpose from torch.utils.data import Dataset @@ -54,7 +54,13 @@ from diffusers import ( ) from diffusers.loaders import LoraLoaderMixin from diffusers.optimization import get_scheduler -from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, is_wandb_available +from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params +from diffusers.utils import ( + check_min_version, + convert_state_dict_to_diffusers, + convert_unet_state_dict_to_peft, + is_wandb_available, +) from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.torch_utils import is_compiled_module @@ -892,10 +898,33 @@ def main(args): raise ValueError(f"unexpected save model: {model.__class__}") lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) - LoraLoaderMixin.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=unet_) - LoraLoaderMixin.load_lora_into_text_encoder( - lora_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_ - ) + + unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} + unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) + incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default") + + if incompatible_keys is not None: + # check only for unexpected keys + unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) + if unexpected_keys: + logger.warning( + f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " + f" {unexpected_keys}. " + ) + + if args.train_text_encoder: + _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_) + + # Make sure the trainable params are in float32. This is again needed since the base models + # are in `weight_dtype`. More details: + # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804 + if args.mixed_precision == "fp16": + models = [unet_] + if args.train_text_encoder: + models.append(text_encoder_) + + # only upcast trainable parameters (LoRA) into fp32 + cast_training_params(models, dtype=torch.float32) accelerator.register_save_state_pre_hook(save_model_hook) accelerator.register_load_state_pre_hook(load_model_hook) @@ -910,6 +939,15 @@ def main(args): args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes ) + # Make sure the trainable params are in float32. 
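`cast_training_params`, imported from `diffusers.training_utils` in the hunks above, replaces the manual `requires_grad` loop: it upcasts only the trainable (LoRA) parameters to the requested dtype and leaves frozen weights untouched. A minimal sketch with a toy fp16 module standing in for the LoRA-augmented UNet (an illustrative assumption, not code from this diff):

```python
import torch
from diffusers.training_utils import cast_training_params

# toy stand-in for a LoRA-augmented model kept in fp16
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)).half()
model[0].requires_grad_(False)  # frozen "base" weights stay in fp16
model[1].requires_grad_(True)   # trainable "adapter" weights are upcast

cast_training_params(model, dtype=torch.float32)

print(model[0].weight.dtype)  # torch.float16
print(model[1].weight.dtype)  # torch.float32
```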
+ if args.mixed_precision == "fp16": + models = [unet] + if args.train_text_encoder: + models.append(text_encoder) + + # only upcast trainable parameters (LoRA) into fp32 + cast_training_params(models, dtype=torch.float32) + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs if args.use_8bit_adam: try: diff --git a/examples/dreambooth/train_dreambooth_lora_sdxl.py b/examples/dreambooth/train_dreambooth_lora_sdxl.py index 8f92e3b442..a995eb3043 100644 --- a/examples/dreambooth/train_dreambooth_lora_sdxl.py +++ b/examples/dreambooth/train_dreambooth_lora_sdxl.py @@ -34,7 +34,7 @@ from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration from huggingface_hub import create_repo, upload_folder from huggingface_hub.utils import insecure_hashlib from packaging import version -from peft import LoraConfig +from peft import LoraConfig, set_peft_model_state_dict from peft.utils import get_peft_model_state_dict from PIL import Image from PIL.ImageOps import exif_transpose @@ -53,8 +53,13 @@ from diffusers import ( ) from diffusers.loaders import LoraLoaderMixin from diffusers.optimization import get_scheduler -from diffusers.training_utils import compute_snr -from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, is_wandb_available +from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr +from diffusers.utils import ( + check_min_version, + convert_state_dict_to_diffusers, + convert_unet_state_dict_to_peft, + is_wandb_available, +) from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.torch_utils import is_compiled_module @@ -997,17 +1002,6 @@ def main(args): text_encoder_one.add_adapter(text_lora_config) text_encoder_two.add_adapter(text_lora_config) - # Make sure the trainable params are in float32. - if args.mixed_precision == "fp16": - models = [unet] - if args.train_text_encoder: - models.extend([text_encoder_one, text_encoder_two]) - for model in models: - for param in model.parameters(): - # only upcast trainable parameters (LoRA) into fp32 - if param.requires_grad: - param.data = param.to(torch.float32) - def unwrap_model(model): model = accelerator.unwrap_model(model) model = model._orig_mod if is_compiled_module(model) else model @@ -1064,17 +1058,36 @@ def main(args): raise ValueError(f"unexpected save model: {model.__class__}") lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) - LoraLoaderMixin.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=unet_) - text_encoder_state_dict = {k: v for k, v in lora_state_dict.items() if "text_encoder." in k} - LoraLoaderMixin.load_lora_into_text_encoder( - text_encoder_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_one_ - ) + unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} + unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) + incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default") + if incompatible_keys is not None: + # check only for unexpected keys + unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) + if unexpected_keys: + logger.warning( + f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " + f" {unexpected_keys}. " + ) - text_encoder_2_state_dict = {k: v for k, v in lora_state_dict.items() if "text_encoder_2." 
in k} - LoraLoaderMixin.load_lora_into_text_encoder( - text_encoder_2_state_dict, network_alphas=network_alphas, text_encoder=text_encoder_two_ - ) + if args.train_text_encoder: + # Do we need to call `scale_lora_layers()` here? + _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_) + + _set_state_dict_into_text_encoder( + lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_one_ + ) + + # Make sure the trainable params are in float32. This is again needed since the base models + # are in `weight_dtype`. More details: + # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804 + if args.mixed_precision == "fp16": + models = [unet_] + if args.train_text_encoder: + models.extend([text_encoder_one_, text_encoder_two_]) + # only upcast trainable parameters (LoRA) into fp32 + cast_training_params(models) accelerator.register_save_state_pre_hook(save_model_hook) accelerator.register_load_state_pre_hook(load_model_hook) @@ -1089,6 +1102,15 @@ def main(args): args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes ) + # Make sure the trainable params are in float32. + if args.mixed_precision == "fp16": + models = [unet] + if args.train_text_encoder: + models.extend([text_encoder_one, text_encoder_two]) + + # only upcast trainable parameters (LoRA) into fp32 + cast_training_params(models, dtype=torch.float32) + unet_lora_parameters = list(filter(lambda p: p.requires_grad, unet.parameters())) if args.train_text_encoder: @@ -1506,6 +1528,7 @@ def main(args): else unet_lora_parameters ) accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + optimizer.step() lr_scheduler.step() optimizer.zero_grad() diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix.py b/examples/instruct_pix2pix/train_instruct_pix2pix.py index 78cb7bc2f9..2af858cfd0 100644 --- a/examples/instruct_pix2pix/train_instruct_pix2pix.py +++ b/examples/instruct_pix2pix/train_instruct_pix2pix.py @@ -49,6 +49,7 @@ from diffusers.optimization import get_scheduler from diffusers.training_utils import EMAModel from diffusers.utils import check_min_version, deprecate, is_wandb_available from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import is_compiled_module # Will error if the minimal version of diffusers is not installed. Remove at your own risks. @@ -489,6 +490,11 @@ def main(): else: raise ValueError("xformers is not available. Make sure it is installed correctly") + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model + # `accelerate` 0.16.0 will have better support for customized saving if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format @@ -845,7 +851,7 @@ def main(): raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") # Predict the noise residual and compute loss - model_pred = unet(concatenated_noisy_latents, timesteps, encoder_hidden_states).sample + model_pred = unet(concatenated_noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0] loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") # Gather the losses across all processes for logging (if we use distributed training). 
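The `unet(..., return_dict=False)[0]` change above recurs throughout the training scripts in this diff: instead of building the output dataclass and reading its `.sample` attribute, the forward call returns a plain tuple that is indexed directly. Below is a minimal, self-contained sketch of the two equivalent call styles; the tiny UNet configuration is an illustrative assumption, not taken from this diff.

```python
import torch
from diffusers import UNet2DConditionModel

# toy UNet configuration (assumed for illustration only)
unet = UNet2DConditionModel(
    sample_size=32,
    in_channels=4,
    out_channels=4,
    layers_per_block=1,
    block_out_channels=(32, 64),
    down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
    up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
    cross_attention_dim=32,
)

sample = torch.randn(1, 4, 32, 32)
timestep = torch.tensor([10])
encoder_hidden_states = torch.randn(1, 77, 32)

with torch.no_grad():
    # previous style: construct the output dataclass and read `.sample`
    pred_a = unet(sample, timestep, encoder_hidden_states).sample
    # style used in this diff: request a plain tuple and index it
    pred_b = unet(sample, timestep, encoder_hidden_states, return_dict=False)[0]

print(torch.equal(pred_a, pred_b))  # True
```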
@@ -919,9 +925,9 @@ def main(): # The models need unwrapping because for compatibility in distributed training mode. pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained( args.pretrained_model_name_or_path, - unet=accelerator.unwrap_model(unet), - text_encoder=accelerator.unwrap_model(text_encoder), - vae=accelerator.unwrap_model(vae), + unet=unwrap_model(unet), + text_encoder=unwrap_model(text_encoder), + vae=unwrap_model(vae), revision=args.revision, variant=args.variant, torch_dtype=weight_dtype, @@ -965,14 +971,14 @@ def main(): # Create the pipeline using the trained modules and save it. accelerator.wait_for_everyone() if accelerator.is_main_process: - unet = accelerator.unwrap_model(unet) + unet = unwrap_model(unet) if args.use_ema: ema_unet.copy_to(unet.parameters()) pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained( args.pretrained_model_name_or_path, - text_encoder=accelerator.unwrap_model(text_encoder), - vae=accelerator.unwrap_model(vae), + text_encoder=unwrap_model(text_encoder), + vae=unwrap_model(vae), unet=unet, revision=args.revision, variant=args.variant, diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py b/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py index de59cb1f0b..cab16a6333 100644 --- a/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py +++ b/examples/instruct_pix2pix/train_instruct_pix2pix_sdxl.py @@ -52,6 +52,7 @@ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_instru from diffusers.training_utils import EMAModel from diffusers.utils import check_min_version, deprecate, is_wandb_available, load_image from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import is_compiled_module # Will error if the minimal version of diffusers is not installed. Remove at your own risks. @@ -531,6 +532,11 @@ def main(): else: raise ValueError("xformers is not available. Make sure it is installed correctly") + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model + # `accelerate` 0.16.0 will have better support for customized saving if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format @@ -1044,8 +1050,12 @@ def main(): added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} model_pred = unet( - concatenated_noisy_latents, timesteps, encoder_hidden_states, added_cond_kwargs=added_cond_kwargs - ).sample + concatenated_noisy_latents, + timesteps, + encoder_hidden_states, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") # Gather the losses across all processes for logging (if we use distributed training). @@ -1099,7 +1109,7 @@ def main(): progress_bar.set_postfix(**logs) ### BEGIN: Perform validation every `validation_epochs` steps - if global_step % args.validation_steps == 0 or global_step == 1: + if global_step % args.validation_steps == 0: if (args.val_image_url_or_path is not None) and (args.validation_prompt is not None): logger.info( f"Running validation... \n Generating {args.num_validation_images} images with prompt:" @@ -1115,7 +1125,7 @@ def main(): # The models need unwrapping because for compatibility in distributed training mode. 
pipeline = StableDiffusionXLInstructPix2PixPipeline.from_pretrained( args.pretrained_model_name_or_path, - unet=accelerator.unwrap_model(unet), + unet=unwrap_model(unet), text_encoder=text_encoder_1, text_encoder_2=text_encoder_2, tokenizer=tokenizer_1, @@ -1177,7 +1187,7 @@ def main(): # Create the pipeline using the trained modules and save it. accelerator.wait_for_everyone() if accelerator.is_main_process: - unet = accelerator.unwrap_model(unet) + unet = unwrap_model(unet) if args.use_ema: ema_unet.copy_to(unet.parameters()) diff --git a/examples/research_projects/realfill/requirements.txt b/examples/research_projects/realfill/requirements.txt index 5d69d84563..f6abdc6e7e 100644 --- a/examples/research_projects/realfill/requirements.txt +++ b/examples/research_projects/realfill/requirements.txt @@ -6,4 +6,4 @@ torch==2.0.1 torchvision>=0.16 ftfy==6.1.1 tensorboard==2.14.0 -Jinja2==3.1.2 +Jinja2==3.1.3 diff --git a/examples/t2i_adapter/train_t2i_adapter_sdxl.py b/examples/t2i_adapter/train_t2i_adapter_sdxl.py index e06fc227af..645f1f04e1 100644 --- a/examples/t2i_adapter/train_t2i_adapter_sdxl.py +++ b/examples/t2i_adapter/train_t2i_adapter_sdxl.py @@ -50,6 +50,7 @@ from diffusers import ( from diffusers.optimization import get_scheduler from diffusers.utils import check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import is_compiled_module MAX_SEQ_LENGTH = 77 @@ -926,6 +927,11 @@ def main(args): else: raise ValueError("xformers is not available. Make sure it is installed correctly") + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model + if args.gradient_checkpointing: unet.enable_gradient_checkpointing() @@ -935,9 +941,9 @@ def main(args): " doing mixed precision training, copy of the weights should still be float32." ) - if accelerator.unwrap_model(t2iadapter).dtype != torch.float32: + if unwrap_model(t2iadapter).dtype != torch.float32: raise ValueError( - f"Controlnet loaded as datatype {accelerator.unwrap_model(t2iadapter).dtype}. {low_precision_error_string}" + f"Controlnet loaded as datatype {unwrap_model(t2iadapter).dtype}. {low_precision_error_string}" ) # Enable TF32 for faster training on Ampere GPUs, @@ -1198,7 +1204,8 @@ def main(args): encoder_hidden_states=batch["prompt_ids"], added_cond_kwargs=batch["unet_added_conditions"], down_block_additional_residuals=down_block_additional_residuals, - ).sample + return_dict=False, + )[0] # Denoise the latents denoised_latents = model_pred * (-sigmas) + noisy_latents @@ -1279,7 +1286,7 @@ def main(args): # Create the pipeline using using the trained modules and save it. 
accelerator.wait_for_everyone() if accelerator.is_main_process: - t2iadapter = accelerator.unwrap_model(t2iadapter) + t2iadapter = unwrap_model(t2iadapter) t2iadapter.save_pretrained(args.output_dir) if args.push_to_hub: diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index 27dedc8f7f..906884eea7 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -43,9 +43,10 @@ from transformers import CLIPTextModel, CLIPTokenizer import diffusers from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, StableDiffusionPipeline, UNet2DConditionModel from diffusers.optimization import get_scheduler -from diffusers.training_utils import compute_snr +from diffusers.training_utils import cast_training_params, compute_snr from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, is_wandb_available from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import is_compiled_module # Will error if the minimal version of diffusers is not installed. Remove at your own risks. @@ -466,10 +467,8 @@ def main(): # Add adapter and make sure the trainable params are in float32. unet.add_adapter(unet_lora_config) if args.mixed_precision == "fp16": - for param in unet.parameters(): - # only upcast trainable parameters (LoRA) into fp32 - if param.requires_grad: - param.data = param.to(torch.float32) + # only upcast trainable parameters (LoRA) into fp32 + cast_training_params(unet, dtype=torch.float32) if args.enable_xformers_memory_efficient_attention: if is_xformers_available(): @@ -598,6 +597,11 @@ def main(): ] ) + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model + def preprocess_train(examples): images = [image.convert("RGB") for image in examples[image_column]] examples["pixel_values"] = [train_transforms(image) for image in images] @@ -731,7 +735,7 @@ def main(): noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) # Get the text embedding for conditioning - encoder_hidden_states = text_encoder(batch["input_ids"])[0] + encoder_hidden_states = text_encoder(batch["input_ids"], return_dict=False)[0] # Get the target for loss depending on the prediction type if args.prediction_type is not None: @@ -746,7 +750,7 @@ def main(): raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") # Predict the noise residual and compute loss - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0] if args.snr_gamma is None: loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") @@ -811,7 +815,7 @@ def main(): save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) - unwrapped_unet = accelerator.unwrap_model(unet) + unwrapped_unet = unwrap_model(unet) unet_lora_state_dict = convert_state_dict_to_diffusers( get_peft_model_state_dict(unwrapped_unet) ) @@ -839,7 +843,7 @@ def main(): # create pipeline pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, - unet=accelerator.unwrap_model(unet), + unet=unwrap_model(unet), revision=args.revision, variant=args.variant, torch_dtype=weight_dtype, @@ -880,7 +884,7 @@ def main(): if accelerator.is_main_process: unet = unet.to(torch.float32) - 
unwrapped_unet = accelerator.unwrap_model(unet) + unwrapped_unet = unwrap_model(unet) unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unwrapped_unet)) StableDiffusionPipeline.save_lora_weights( save_directory=args.output_dir, diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py index 606a88f55b..6b13f75ead 100644 --- a/examples/text_to_image/train_text_to_image_lora_sdxl.py +++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py @@ -51,9 +51,10 @@ from diffusers import ( ) from diffusers.loaders import LoraLoaderMixin from diffusers.optimization import get_scheduler -from diffusers.training_utils import compute_snr +from diffusers.training_utils import cast_training_params, compute_snr from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, is_wandb_available from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import is_compiled_module # Will error if the minimal version of diffusers is not installed. Remove at your own risks. @@ -460,13 +461,12 @@ def encode_prompt(text_encoders, tokenizers, prompt, text_input_ids_list=None): text_input_ids = text_input_ids_list[i] prompt_embeds = text_encoder( - text_input_ids.to(text_encoder.device), - output_hidden_states=True, + text_input_ids.to(text_encoder.device), output_hidden_states=True, return_dict=False ) # We are only ALWAYS interested in the pooled output of the final text encoder pooled_prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds.hidden_states[-2] + prompt_embeds = prompt_embeds[-1][-2] bs_embed, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1) prompt_embeds_list.append(prompt_embeds) @@ -634,11 +634,13 @@ def main(args): models = [unet] if args.train_text_encoder: models.extend([text_encoder_one, text_encoder_two]) - for model in models: - for param in model.parameters(): - # only upcast trainable parameters (LoRA) into fp32 - if param.requires_grad: - param.data = param.to(torch.float32) + # only upcast trainable parameters (LoRA) into fp32 + cast_training_params(models, dtype=torch.float32) + + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): @@ -650,13 +652,13 @@ def main(args): text_encoder_two_lora_layers_to_save = None for model in models: - if isinstance(model, type(accelerator.unwrap_model(unet))): + if isinstance(model, type(unwrap_model(unet))): unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model)) - elif isinstance(model, type(accelerator.unwrap_model(text_encoder_one))): + elif isinstance(model, type(unwrap_model(text_encoder_one))): text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers( get_peft_model_state_dict(model) ) - elif isinstance(model, type(accelerator.unwrap_model(text_encoder_two))): + elif isinstance(model, type(unwrap_model(text_encoder_two))): text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers( get_peft_model_state_dict(model) ) @@ -681,11 +683,11 @@ def main(args): while len(models) > 0: model = models.pop() - if isinstance(model, type(accelerator.unwrap_model(unet))): + if isinstance(model, type(unwrap_model(unet))): unet_ = model - elif 
isinstance(model, type(accelerator.unwrap_model(text_encoder_one))): + elif isinstance(model, type(unwrap_model(text_encoder_one))): text_encoder_one_ = model - elif isinstance(model, type(accelerator.unwrap_model(text_encoder_two))): + elif isinstance(model, type(unwrap_model(text_encoder_two))): text_encoder_two_ = model else: raise ValueError(f"unexpected save model: {model.__class__}") @@ -834,6 +836,9 @@ def main(args): for image in images: original_sizes.append((image.height, image.width)) image = train_resize(image) + if args.random_flip and random.random() < 0.5: + # flip + image = train_flip(image) if args.center_crop: y1 = max(0, int(round((image.height - args.resolution) / 2.0))) x1 = max(0, int(round((image.width - args.resolution) / 2.0))) @@ -841,10 +846,6 @@ def main(args): else: y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution)) image = crop(image, y1, x1, h, w) - if args.random_flip and random.random() < 0.5: - # flip - x1 = image.width - x1 - image = train_flip(image) crop_top_left = (y1, x1) crop_top_lefts.append(crop_top_left) image = train_transforms(image) @@ -1034,8 +1035,12 @@ def main(args): ) unet_added_conditions.update({"text_embeds": pooled_prompt_embeds}) model_pred = unet( - noisy_model_input, timesteps, prompt_embeds, added_cond_kwargs=unet_added_conditions - ).sample + noisy_model_input, + timesteps, + prompt_embeds, + added_cond_kwargs=unet_added_conditions, + return_dict=False, + )[0] # Get the target for loss depending on the prediction type if args.prediction_type is not None: @@ -1128,9 +1133,9 @@ def main(args): pipeline = StableDiffusionXLPipeline.from_pretrained( args.pretrained_model_name_or_path, vae=vae, - text_encoder=accelerator.unwrap_model(text_encoder_one), - text_encoder_2=accelerator.unwrap_model(text_encoder_two), - unet=accelerator.unwrap_model(unet), + text_encoder=unwrap_model(text_encoder_one), + text_encoder_2=unwrap_model(text_encoder_two), + unet=unwrap_model(unet), revision=args.revision, variant=args.variant, torch_dtype=weight_dtype, @@ -1169,12 +1174,12 @@ def main(args): # Save the lora layers accelerator.wait_for_everyone() if accelerator.is_main_process: - unet = accelerator.unwrap_model(unet) + unet = unwrap_model(unet) unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet)) if args.train_text_encoder: - text_encoder_one = accelerator.unwrap_model(text_encoder_one) - text_encoder_two = accelerator.unwrap_model(text_encoder_two) + text_encoder_one = unwrap_model(text_encoder_one) + text_encoder_two = unwrap_model(text_encoder_two) text_encoder_lora_layers = convert_state_dict_to_diffusers(get_peft_model_state_dict(text_encoder_one)) text_encoder_2_lora_layers = convert_state_dict_to_diffusers(get_peft_model_state_dict(text_encoder_two)) diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py index 0bb57b1f31..5ec27c4c49 100644 --- a/examples/text_to_image/train_text_to_image_sdxl.py +++ b/examples/text_to_image/train_text_to_image_sdxl.py @@ -44,16 +44,12 @@ from tqdm.auto import tqdm from transformers import AutoTokenizer, PretrainedConfig import diffusers -from diffusers import ( - AutoencoderKL, - DDPMScheduler, - StableDiffusionXLPipeline, - UNet2DConditionModel, -) +from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionXLPipeline, UNet2DConditionModel from diffusers.optimization import get_scheduler from diffusers.training_utils import EMAModel, compute_snr from diffusers.utils import 
check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import is_compiled_module # Will error if the minimal version of diffusers is not installed. Remove at your own risks. @@ -508,11 +504,12 @@ def encode_prompt(batch, text_encoders, tokenizers, proportion_empty_prompts, ca prompt_embeds = text_encoder( text_input_ids.to(text_encoder.device), output_hidden_states=True, + return_dict=False, ) # We are only ALWAYS interested in the pooled output of the final text encoder pooled_prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds.hidden_states[-2] + prompt_embeds = prompt_embeds[-1][-2] bs_embed, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1) prompt_embeds_list.append(prompt_embeds) @@ -842,6 +839,9 @@ def main(args): for image in images: original_sizes.append((image.height, image.width)) image = train_resize(image) + if args.random_flip and random.random() < 0.5: + # flip + image = train_flip(image) if args.center_crop: y1 = max(0, int(round((image.height - args.resolution) / 2.0))) x1 = max(0, int(round((image.width - args.resolution) / 2.0))) @@ -849,10 +849,6 @@ else: y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution)) image = crop(image, y1, x1, h, w) - if args.random_flip and random.random() < 0.5: - # flip - x1 = image.width - x1 - image = train_flip(image) crop_top_left = (y1, x1) crop_top_lefts.append(crop_top_left) image = train_transforms(image) @@ -955,6 +951,12 @@ def main(args): if accelerator.is_main_process: accelerator.init_trackers("text2image-fine-tune-sdxl", config=vars(args)) + # Function for unwrapping if torch.compile() was used in accelerate. + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model + # Train!
total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps @@ -1054,8 +1056,12 @@ def main(args): pooled_prompt_embeds = batch["pooled_prompt_embeds"].to(accelerator.device) unet_added_conditions.update({"text_embeds": pooled_prompt_embeds}) model_pred = unet( - noisy_model_input, timesteps, prompt_embeds, added_cond_kwargs=unet_added_conditions - ).sample + noisy_model_input, + timesteps, + prompt_embeds, + added_cond_kwargs=unet_added_conditions, + return_dict=False, + )[0] # Get the target for loss depending on the prediction type if args.prediction_type is not None: @@ -1206,7 +1212,7 @@ def main(args): accelerator.wait_for_everyone() if accelerator.is_main_process: - unet = accelerator.unwrap_model(unet) + unet = unwrap_model(unet) if args.use_ema: ema_unet.copy_to(unet.parameters()) diff --git a/scripts/conversion_ldm_uncond.py b/scripts/conversion_ldm_uncond.py index d2ebb3934b..8c22cc1ce8 100644 --- a/scripts/conversion_ldm_uncond.py +++ b/scripts/conversion_ldm_uncond.py @@ -1,13 +1,13 @@ import argparse -import OmegaConf import torch +import yaml from diffusers import DDIMScheduler, LDMPipeline, UNetLDMModel, VQModel def convert_ldm_original(checkpoint_path, config_path, output_path): - config = OmegaConf.load(config_path) + config = yaml.safe_load(config_path) state_dict = torch.load(checkpoint_path, map_location="cpu")["model"] keys = list(state_dict.keys()) @@ -25,8 +25,8 @@ def convert_ldm_original(checkpoint_path, config_path, output_path): if key.startswith(unet_key): unet_state_dict[key.replace(unet_key, "")] = state_dict[key] - vqvae_init_args = config.model.params.first_stage_config.params - unet_init_args = config.model.params.unet_config.params + vqvae_init_args = config["model"]["params"]["first_stage_config"]["params"] + unet_init_args = config["model"]["params"]["unet_config"]["params"] vqvae = VQModel(**vqvae_init_args).eval() vqvae.load_state_dict(first_stage_dict) @@ -35,10 +35,10 @@ def convert_ldm_original(checkpoint_path, config_path, output_path): unet.load_state_dict(unet_state_dict) noise_scheduler = DDIMScheduler( - timesteps=config.model.params.timesteps, + timesteps=config["model"]["params"]["timesteps"], beta_schedule="scaled_linear", - beta_start=config.model.params.linear_start, - beta_end=config.model.params.linear_end, + beta_start=config["model"]["params"]["linear_start"], + beta_end=config["model"]["params"]["linear_end"], clip_sample=False, ) diff --git a/scripts/convert_gligen_to_diffusers.py b/scripts/convert_gligen_to_diffusers.py index 816e4c112e..30d789b606 100644 --- a/scripts/convert_gligen_to_diffusers.py +++ b/scripts/convert_gligen_to_diffusers.py @@ -2,6 +2,7 @@ import argparse import re import torch +import yaml from transformers import ( CLIPProcessor, CLIPTextModel, @@ -28,8 +29,6 @@ from diffusers.pipelines.stable_diffusion.convert_from_ckpt import ( textenc_conversion_map, textenc_pattern, ) -from diffusers.utils import is_omegaconf_available -from diffusers.utils.import_utils import BACKENDS_MAPPING def convert_open_clip_checkpoint(checkpoint): @@ -370,52 +369,52 @@ def convert_gligen_unet_checkpoint(checkpoint, config, path=None, extract_ema=Fa def create_vae_config(original_config, image_size: int): - vae_params = original_config.autoencoder.params.ddconfig - _ = original_config.autoencoder.params.embed_dim + vae_params = original_config["autoencoder"]["params"]["ddconfig"] + _ = original_config["autoencoder"]["params"]["embed_dim"] - block_out_channels = [vae_params.ch * mult for 
mult in vae_params.ch_mult] + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) config = { "sample_size": image_size, - "in_channels": vae_params.in_channels, - "out_channels": vae_params.out_ch, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "latent_channels": vae_params.z_channels, - "layers_per_block": vae_params.num_res_blocks, + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], } return config def create_unet_config(original_config, image_size: int, attention_type): - unet_params = original_config.model.params - vae_params = original_config.autoencoder.params.ddconfig + unet_params = original_config["model"]["params"] + vae_params = original_config["autoencoder"]["params"]["ddconfig"] - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) - head_dim = unet_params.num_heads if "num_heads" in unet_params else None + head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None use_linear_projection = ( - unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False ) if use_linear_projection: if head_dim is None: @@ -423,11 +422,11 @@ def create_unet_config(original_config, image_size: int, attention_type): config = { "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params.in_channels, + "in_channels": unet_params["in_channels"], "down_block_types": tuple(down_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": unet_params.num_res_blocks, - "cross_attention_dim": unet_params.context_dim, + "layers_per_block": unet_params["num_res_blocks"], + "cross_attention_dim": unet_params["context_dim"], "attention_head_dim": head_dim, "use_linear_projection": use_linear_projection, "attention_type": attention_type, @@ -445,11 +444,6 @@ def convert_gligen_to_diffusers( num_in_channels: int = None, device: str = None, ): - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - if device is None: device = "cuda" if torch.cuda.is_available() else "cpu" checkpoint = torch.load(checkpoint_path, map_location=device) 
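The hunks above and below all follow the same migration pattern: `OmegaConf.load` plus attribute-style lookups (for example `config.model.params.timesteps`) are replaced by `yaml.safe_load` plus plain dict indexing. A minimal sketch of that pattern follows; the config path and key names are illustrative only, and note that `yaml.safe_load` parses YAML text or an open stream rather than a filesystem path, so a path argument should be opened (or read) before loading.

```python
import yaml

# Hypothetical config path, for illustration only.
config_path = "original_config.yaml"

# yaml.safe_load expects YAML text or an open stream, not a path,
# so open the file before parsing it.
with open(config_path) as f:
    original_config = yaml.safe_load(f)

# Nested values are read with dict indexing instead of attribute access.
num_train_timesteps = original_config["model"]["params"]["timesteps"]
beta_start = original_config["model"]["params"]["linear_start"]
beta_end = original_config["model"]["params"]["linear_end"]

# Key-membership checks still work on the plain dict, so config-dependent
# defaults can be kept as-is.
unet_params = original_config["model"]["params"]["unet_config"]["params"]
head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None
```

The same indexing style carries through the UNet, VAE, and vocoder sub-configs converted in the scripts that follow.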
@@ -461,14 +455,14 @@ def convert_gligen_to_diffusers( else: print("global_step key not found in model") - original_config = OmegaConf.load(original_config_file) + original_config = yaml.safe_load(original_config_file) if num_in_channels is not None: original_config["model"]["params"]["in_channels"] = num_in_channels - num_train_timesteps = original_config.diffusion.params.timesteps - beta_start = original_config.diffusion.params.linear_start - beta_end = original_config.diffusion.params.linear_end + num_train_timesteps = original_config["diffusion"]["params"]["timesteps"] + beta_start = original_config["diffusion"]["params"]["linear_start"] + beta_end = original_config["diffusion"]["params"]["linear_end"] scheduler = DDIMScheduler( beta_end=beta_end, diff --git a/scripts/convert_if.py b/scripts/convert_if.py index 66d7f694c8..c4588f4b25 100644 --- a/scripts/convert_if.py +++ b/scripts/convert_if.py @@ -4,6 +4,7 @@ import os import numpy as np import torch +import yaml from torch.nn import functional as F from transformers import CLIPConfig, CLIPImageProcessor, CLIPVisionModelWithProjection, T5EncoderModel, T5Tokenizer @@ -11,14 +12,6 @@ from diffusers import DDPMScheduler, IFPipeline, IFSuperResolutionPipeline, UNet from diffusers.pipelines.deepfloyd_if.safety_checker import IFSafetyChecker -try: - from omegaconf import OmegaConf -except ImportError: - raise ImportError( - "OmegaConf is required to convert the IF checkpoints. Please install it with `pip install" " OmegaConf`." - ) - - def parse_args(): parser = argparse.ArgumentParser() @@ -143,8 +136,8 @@ def convert_super_res_pipeline(tokenizer, text_encoder, feature_extractor, safet def get_stage_1_unet(unet_config, unet_checkpoint_path): - original_unet_config = OmegaConf.load(unet_config) - original_unet_config = original_unet_config.params + original_unet_config = yaml.safe_load(unet_config) + original_unet_config = original_unet_config["params"] unet_diffusers_config = create_unet_diffusers_config(original_unet_config) @@ -215,11 +208,11 @@ def convert_safety_checker(p_head_path, w_head_path): def create_unet_diffusers_config(original_unet_config, class_embed_type=None): - attention_resolutions = parse_list(original_unet_config.attention_resolutions) - attention_resolutions = [original_unet_config.image_size // int(res) for res in attention_resolutions] + attention_resolutions = parse_list(original_unet_config["attention_resolutions"]) + attention_resolutions = [original_unet_config["image_size"] // int(res) for res in attention_resolutions] - channel_mult = parse_list(original_unet_config.channel_mult) - block_out_channels = [original_unet_config.model_channels * mult for mult in channel_mult] + channel_mult = parse_list(original_unet_config["channel_mult"]) + block_out_channels = [original_unet_config["model_channels"] * mult for mult in channel_mult] down_block_types = [] resolution = 1 @@ -227,7 +220,7 @@ def create_unet_diffusers_config(original_unet_config, class_embed_type=None): for i in range(len(block_out_channels)): if resolution in attention_resolutions: block_type = "SimpleCrossAttnDownBlock2D" - elif original_unet_config.resblock_updown: + elif original_unet_config["resblock_updown"]: block_type = "ResnetDownsampleBlock2D" else: block_type = "DownBlock2D" @@ -241,17 +234,17 @@ def create_unet_diffusers_config(original_unet_config, class_embed_type=None): for i in range(len(block_out_channels)): if resolution in attention_resolutions: block_type = "SimpleCrossAttnUpBlock2D" - elif original_unet_config.resblock_updown: 
+ elif original_unet_config["resblock_updown"]: block_type = "ResnetUpsampleBlock2D" else: block_type = "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - head_dim = original_unet_config.num_head_channels + head_dim = original_unet_config["num_head_channels"] use_linear_projection = ( - original_unet_config.use_linear_in_transformer + original_unet_config["use_linear_in_transformer"] if "use_linear_in_transformer" in original_unet_config else False ) @@ -264,27 +257,27 @@ def create_unet_diffusers_config(original_unet_config, class_embed_type=None): if class_embed_type is None: if "num_classes" in original_unet_config: - if original_unet_config.num_classes == "sequential": + if original_unet_config["num_classes"] == "sequential": class_embed_type = "projection" assert "adm_in_channels" in original_unet_config - projection_class_embeddings_input_dim = original_unet_config.adm_in_channels + projection_class_embeddings_input_dim = original_unet_config["adm_in_channels"] else: raise NotImplementedError( - f"Unknown conditional unet num_classes config: {original_unet_config.num_classes}" + f"Unknown conditional unet num_classes config: {original_unet_config['num_classes']}" ) config = { - "sample_size": original_unet_config.image_size, - "in_channels": original_unet_config.in_channels, + "sample_size": original_unet_config["image_size"], + "in_channels": original_unet_config["in_channels"], "down_block_types": tuple(down_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": original_unet_config.num_res_blocks, - "cross_attention_dim": original_unet_config.encoder_channels, + "layers_per_block": original_unet_config["num_res_blocks"], + "cross_attention_dim": original_unet_config["encoder_channels"], "attention_head_dim": head_dim, "use_linear_projection": use_linear_projection, "class_embed_type": class_embed_type, "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, - "out_channels": original_unet_config.out_channels, + "out_channels": original_unet_config["out_channels"], "up_block_types": tuple(up_block_types), "upcast_attention": False, # TODO: guessing "cross_attention_norm": "group_norm", @@ -293,11 +286,11 @@ def create_unet_diffusers_config(original_unet_config, class_embed_type=None): "act_fn": "gelu", } - if original_unet_config.use_scale_shift_norm: + if original_unet_config["use_scale_shift_norm"]: config["resnet_time_scale_shift"] = "scale_shift" if "encoder_dim" in original_unet_config: - config["encoder_hid_dim"] = original_unet_config.encoder_dim + config["encoder_hid_dim"] = original_unet_config["encoder_dim"] return config @@ -725,15 +718,15 @@ def parse_list(value): def get_super_res_unet(unet_checkpoint_path, verify_param_count=True, sample_size=None): orig_path = unet_checkpoint_path - original_unet_config = OmegaConf.load(os.path.join(orig_path, "config.yml")) - original_unet_config = original_unet_config.params + original_unet_config = yaml.safe_load(os.path.join(orig_path, "config.yml")) + original_unet_config = original_unet_config["params"] unet_diffusers_config = superres_create_unet_diffusers_config(original_unet_config) - unet_diffusers_config["time_embedding_dim"] = original_unet_config.model_channels * int( - original_unet_config.channel_mult.split(",")[-1] + unet_diffusers_config["time_embedding_dim"] = original_unet_config["model_channels"] * int( + original_unet_config["channel_mult"].split(",")[-1] ) - if original_unet_config.encoder_dim != original_unet_config.encoder_channels: - 
unet_diffusers_config["encoder_hid_dim"] = original_unet_config.encoder_dim + if original_unet_config["encoder_dim"] != original_unet_config["encoder_channels"]: + unet_diffusers_config["encoder_hid_dim"] = original_unet_config["encoder_dim"] unet_diffusers_config["class_embed_type"] = "timestep" unet_diffusers_config["addition_embed_type"] = "text" @@ -742,16 +735,16 @@ def get_super_res_unet(unet_checkpoint_path, verify_param_count=True, sample_siz unet_diffusers_config["resnet_out_scale_factor"] = 1 / 0.7071 unet_diffusers_config["mid_block_scale_factor"] = 1 / 0.7071 unet_diffusers_config["only_cross_attention"] = ( - bool(original_unet_config.disable_self_attentions) + bool(original_unet_config["disable_self_attentions"]) if ( "disable_self_attentions" in original_unet_config - and isinstance(original_unet_config.disable_self_attentions, int) + and isinstance(original_unet_config["disable_self_attentions"], int) ) else True ) if sample_size is None: - unet_diffusers_config["sample_size"] = original_unet_config.image_size + unet_diffusers_config["sample_size"] = original_unet_config["image_size"] else: # The second upscaler unet's sample size is incorrectly specified # in the config and is instead hardcoded in source @@ -783,11 +776,11 @@ def get_super_res_unet(unet_checkpoint_path, verify_param_count=True, sample_siz def superres_create_unet_diffusers_config(original_unet_config): - attention_resolutions = parse_list(original_unet_config.attention_resolutions) - attention_resolutions = [original_unet_config.image_size // int(res) for res in attention_resolutions] + attention_resolutions = parse_list(original_unet_config["attention_resolutions"]) + attention_resolutions = [original_unet_config["image_size"] // int(res) for res in attention_resolutions] - channel_mult = parse_list(original_unet_config.channel_mult) - block_out_channels = [original_unet_config.model_channels * mult for mult in channel_mult] + channel_mult = parse_list(original_unet_config["channel_mult"]) + block_out_channels = [original_unet_config["model_channels"] * mult for mult in channel_mult] down_block_types = [] resolution = 1 @@ -795,7 +788,7 @@ def superres_create_unet_diffusers_config(original_unet_config): for i in range(len(block_out_channels)): if resolution in attention_resolutions: block_type = "SimpleCrossAttnDownBlock2D" - elif original_unet_config.resblock_updown: + elif original_unet_config["resblock_updown"]: block_type = "ResnetDownsampleBlock2D" else: block_type = "DownBlock2D" @@ -809,16 +802,16 @@ def superres_create_unet_diffusers_config(original_unet_config): for i in range(len(block_out_channels)): if resolution in attention_resolutions: block_type = "SimpleCrossAttnUpBlock2D" - elif original_unet_config.resblock_updown: + elif original_unet_config["resblock_updown"]: block_type = "ResnetUpsampleBlock2D" else: block_type = "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - head_dim = original_unet_config.num_head_channels + head_dim = original_unet_config["num_head_channels"] use_linear_projection = ( - original_unet_config.use_linear_in_transformer + original_unet_config["use_linear_in_transformer"] if "use_linear_in_transformer" in original_unet_config else False ) @@ -831,26 +824,26 @@ def superres_create_unet_diffusers_config(original_unet_config): projection_class_embeddings_input_dim = None if "num_classes" in original_unet_config: - if original_unet_config.num_classes == "sequential": + if original_unet_config["num_classes"] == "sequential": class_embed_type = 
"projection" assert "adm_in_channels" in original_unet_config - projection_class_embeddings_input_dim = original_unet_config.adm_in_channels + projection_class_embeddings_input_dim = original_unet_config["adm_in_channels"] else: raise NotImplementedError( - f"Unknown conditional unet num_classes config: {original_unet_config.num_classes}" + f"Unknown conditional unet num_classes config: {original_unet_config['num_classes']}" ) config = { - "in_channels": original_unet_config.in_channels, + "in_channels": original_unet_config["in_channels"], "down_block_types": tuple(down_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": tuple(original_unet_config.num_res_blocks), - "cross_attention_dim": original_unet_config.encoder_channels, + "layers_per_block": tuple(original_unet_config["num_res_blocks"]), + "cross_attention_dim": original_unet_config["encoder_channels"], "attention_head_dim": head_dim, "use_linear_projection": use_linear_projection, "class_embed_type": class_embed_type, "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, - "out_channels": original_unet_config.out_channels, + "out_channels": original_unet_config["out_channels"], "up_block_types": tuple(up_block_types), "upcast_attention": False, # TODO: guessing "cross_attention_norm": "group_norm", @@ -858,7 +851,7 @@ def superres_create_unet_diffusers_config(original_unet_config): "act_fn": "gelu", } - if original_unet_config.use_scale_shift_norm: + if original_unet_config["use_scale_shift_norm"]: config["resnet_time_scale_shift"] = "scale_shift" return config diff --git a/scripts/convert_original_audioldm2_to_diffusers.py b/scripts/convert_original_audioldm2_to_diffusers.py index f0b22cb4b4..8c9878526a 100644 --- a/scripts/convert_original_audioldm2_to_diffusers.py +++ b/scripts/convert_original_audioldm2_to_diffusers.py @@ -19,6 +19,7 @@ import re from typing import List, Union import torch +import yaml from transformers import ( AutoFeatureExtractor, AutoTokenizer, @@ -45,7 +46,7 @@ from diffusers import ( LMSDiscreteScheduler, PNDMScheduler, ) -from diffusers.utils import is_omegaconf_available, is_safetensors_available +from diffusers.utils import is_safetensors_available from diffusers.utils.import_utils import BACKENDS_MAPPING @@ -212,41 +213,41 @@ def create_unet_diffusers_config(original_config, image_size: int): """ Creates a UNet config for diffusers based on the config of the original AudioLDM2 model. 
""" - unet_params = original_config.model.params.unet_config.params - vae_params = original_config.model.params.first_stage_config.params.ddconfig + unet_params = original_config["model"]["params"]["unet_config"]["params"] + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) - cross_attention_dim = list(unet_params.context_dim) if "context_dim" in unet_params else block_out_channels + cross_attention_dim = list(unet_params["context_dim"]) if "context_dim" in unet_params else block_out_channels if len(cross_attention_dim) > 1: # require two or more cross-attention layers per-block, each of different dimension cross_attention_dim = [cross_attention_dim for _ in range(len(block_out_channels))] config = { "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params.in_channels, - "out_channels": unet_params.out_channels, + "in_channels": unet_params["in_channels"], + "out_channels": unet_params["out_channels"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": unet_params.num_res_blocks, - "transformer_layers_per_block": unet_params.transformer_depth, + "layers_per_block": unet_params["num_res_blocks"], + "transformer_layers_per_block": unet_params["transformer_depth"], "cross_attention_dim": tuple(cross_attention_dim), } @@ -259,24 +260,24 @@ def create_vae_diffusers_config(original_config, checkpoint, image_size: int): Creates a VAE config for diffusers based on the config of the original AudioLDM2 model. Compared to the original Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE. 
""" - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"] - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config.model.params else 0.18215 + scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config["model"]["params"] else 0.18215 config = { "sample_size": image_size, - "in_channels": vae_params.in_channels, - "out_channels": vae_params.out_ch, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "latent_channels": vae_params.z_channels, - "layers_per_block": vae_params.num_res_blocks, + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], "scaling_factor": float(scaling_factor), } return config @@ -285,9 +286,9 @@ def create_vae_diffusers_config(original_config, checkpoint, image_size: int): # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular def create_diffusers_schedular(original_config): schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, + num_train_timesteps=original_config["model"]["params"]["timesteps"], + beta_start=original_config["model"]["params"]["linear_start"], + beta_end=original_config["model"]["params"]["linear_end"], beta_schedule="scaled_linear", ) return schedular @@ -692,17 +693,17 @@ def create_transformers_vocoder_config(original_config): """ Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model. 
""" - vocoder_params = original_config.model.params.vocoder_config.params + vocoder_params = original_config["model"]["params"]["vocoder_config"]["params"] config = { - "model_in_dim": vocoder_params.num_mels, - "sampling_rate": vocoder_params.sampling_rate, - "upsample_initial_channel": vocoder_params.upsample_initial_channel, - "upsample_rates": list(vocoder_params.upsample_rates), - "upsample_kernel_sizes": list(vocoder_params.upsample_kernel_sizes), - "resblock_kernel_sizes": list(vocoder_params.resblock_kernel_sizes), + "model_in_dim": vocoder_params["num_mels"], + "sampling_rate": vocoder_params["sampling_rate"], + "upsample_initial_channel": vocoder_params["upsample_initial_channel"], + "upsample_rates": list(vocoder_params["upsample_rates"]), + "upsample_kernel_sizes": list(vocoder_params["upsample_kernel_sizes"]), + "resblock_kernel_sizes": list(vocoder_params["resblock_kernel_sizes"]), "resblock_dilation_sizes": [ - list(resblock_dilation) for resblock_dilation in vocoder_params.resblock_dilation_sizes + list(resblock_dilation) for resblock_dilation in vocoder_params["resblock_dilation_sizes"] ], "normalize_before": False, } @@ -876,11 +877,6 @@ def load_pipeline_from_original_AudioLDM2_ckpt( return: An AudioLDM2Pipeline object representing the passed-in `.ckpt`/`.safetensors` file. """ - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - if from_safetensors: if not is_safetensors_available(): raise ValueError(BACKENDS_MAPPING["safetensors"][1]) @@ -903,9 +899,8 @@ def load_pipeline_from_original_AudioLDM2_ckpt( if original_config_file is None: original_config = DEFAULT_CONFIG - original_config = OmegaConf.create(original_config) else: - original_config = OmegaConf.load(original_config_file) + original_config = yaml.safe_load(original_config_file) if image_size is not None: original_config["model"]["params"]["unet_config"]["params"]["image_size"] = image_size @@ -926,9 +921,9 @@ def load_pipeline_from_original_AudioLDM2_ckpt( if prediction_type is None: prediction_type = "epsilon" - num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end + num_train_timesteps = original_config["model"]["params"]["timesteps"] + beta_start = original_config["model"]["params"]["linear_start"] + beta_end = original_config["model"]["params"]["linear_end"] scheduler = DDIMScheduler( beta_end=beta_end, @@ -1026,9 +1021,9 @@ def load_pipeline_from_original_AudioLDM2_ckpt( # Convert the GPT2 encoder model: AudioLDM2 uses the same configuration as the original GPT2 base model gpt2_config = GPT2Config.from_pretrained("gpt2") gpt2_model = GPT2Model(gpt2_config) - gpt2_model.config.max_new_tokens = ( - original_config.model.params.cond_stage_config.crossattn_audiomae_generated.params.sequence_gen_length - ) + gpt2_model.config.max_new_tokens = original_config["model"]["params"]["cond_stage_config"][ + "crossattn_audiomae_generated" + ]["params"]["sequence_gen_length"] converted_gpt2_checkpoint = extract_sub_model(checkpoint, key_prefix="cond_stage_models.0.model.") gpt2_model.load_state_dict(converted_gpt2_checkpoint) diff --git a/scripts/convert_original_audioldm_to_diffusers.py b/scripts/convert_original_audioldm_to_diffusers.py index 940c74e9cd..79e8fcc1af 100644 --- a/scripts/convert_original_audioldm_to_diffusers.py +++ b/scripts/convert_original_audioldm_to_diffusers.py @@ -18,6 +18,7 @@ import argparse import re 
import torch +import yaml from transformers import ( AutoTokenizer, ClapTextConfig, @@ -38,8 +39,6 @@ from diffusers import ( PNDMScheduler, UNet2DConditionModel, ) -from diffusers.utils import is_omegaconf_available -from diffusers.utils.import_utils import BACKENDS_MAPPING # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments @@ -215,45 +214,45 @@ def create_unet_diffusers_config(original_config, image_size: int): """ Creates a UNet config for diffusers based on the config of the original AudioLDM model. """ - unet_params = original_config.model.params.unet_config.params - vae_params = original_config.model.params.first_stage_config.params.ddconfig + unet_params = original_config["model"]["params"]["unet_config"]["params"] + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) cross_attention_dim = ( - unet_params.cross_attention_dim if "cross_attention_dim" in unet_params else block_out_channels + unet_params["cross_attention_dim"] if "cross_attention_dim" in unet_params else block_out_channels ) class_embed_type = "simple_projection" if "extra_film_condition_dim" in unet_params else None projection_class_embeddings_input_dim = ( - unet_params.extra_film_condition_dim if "extra_film_condition_dim" in unet_params else None + unet_params["extra_film_condition_dim"] if "extra_film_condition_dim" in unet_params else None ) - class_embeddings_concat = unet_params.extra_film_use_concat if "extra_film_use_concat" in unet_params else None + class_embeddings_concat = unet_params["extra_film_use_concat"] if "extra_film_use_concat" in unet_params else None config = { "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params.in_channels, - "out_channels": unet_params.out_channels, + "in_channels": unet_params["in_channels"], + "out_channels": unet_params["out_channels"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": unet_params.num_res_blocks, + "layers_per_block": unet_params["num_res_blocks"], "cross_attention_dim": cross_attention_dim, "class_embed_type": class_embed_type, "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, @@ -269,24 +268,24 @@ def create_vae_diffusers_config(original_config, checkpoint, image_size: int): Creates a VAE config for diffusers based on the config of the original AudioLDM model. 
Compared to the original Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE. """ - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"] - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config.model.params else 0.18215 + scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config["model"]["params"] else 0.18215 config = { "sample_size": image_size, - "in_channels": vae_params.in_channels, - "out_channels": vae_params.out_ch, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "latent_channels": vae_params.z_channels, - "layers_per_block": vae_params.num_res_blocks, + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], "scaling_factor": float(scaling_factor), } return config @@ -295,9 +294,9 @@ def create_vae_diffusers_config(original_config, checkpoint, image_size: int): # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular def create_diffusers_schedular(original_config): schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, + num_train_timesteps=original_config["model"]["params"]["timesteps"], + beta_start=original_config["model"]["params"]["linear_start"], + beta_end=original_config["model"]["params"]["linear_end"], beta_schedule="scaled_linear", ) return schedular @@ -668,17 +667,17 @@ def create_transformers_vocoder_config(original_config): """ Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model. 
""" - vocoder_params = original_config.model.params.vocoder_config.params + vocoder_params = original_config["model"]["params"]["vocoder_config"]["params"] config = { - "model_in_dim": vocoder_params.num_mels, - "sampling_rate": vocoder_params.sampling_rate, - "upsample_initial_channel": vocoder_params.upsample_initial_channel, - "upsample_rates": list(vocoder_params.upsample_rates), - "upsample_kernel_sizes": list(vocoder_params.upsample_kernel_sizes), - "resblock_kernel_sizes": list(vocoder_params.resblock_kernel_sizes), + "model_in_dim": vocoder_params["num_mels"], + "sampling_rate": vocoder_params["sampling_rate"], + "upsample_initial_channel": vocoder_params["upsample_initial_channel"], + "upsample_rates": list(vocoder_params["upsample_rates"]), + "upsample_kernel_sizes": list(vocoder_params["upsample_kernel_sizes"]), + "resblock_kernel_sizes": list(vocoder_params["resblock_kernel_sizes"]), "resblock_dilation_sizes": [ - list(resblock_dilation) for resblock_dilation in vocoder_params.resblock_dilation_sizes + list(resblock_dilation) for resblock_dilation in vocoder_params["resblock_dilation_sizes"] ], "normalize_before": False, } @@ -818,11 +817,6 @@ def load_pipeline_from_original_audioldm_ckpt( return: An AudioLDMPipeline object representing the passed-in `.ckpt`/`.safetensors` file. """ - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - if from_safetensors: from safetensors import safe_open @@ -842,9 +836,8 @@ def load_pipeline_from_original_audioldm_ckpt( if original_config_file is None: original_config = DEFAULT_CONFIG - original_config = OmegaConf.create(original_config) else: - original_config = OmegaConf.load(original_config_file) + original_config = yaml.safe_load(original_config_file) if num_in_channels is not None: original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels @@ -868,9 +861,9 @@ def load_pipeline_from_original_audioldm_ckpt( if image_size is None: image_size = 512 - num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end + num_train_timesteps = original_config["model"]["params"]["timesteps"] + beta_start = original_config["model"]["params"]["linear_start"] + beta_end = original_config["model"]["params"]["linear_end"] scheduler = DDIMScheduler( beta_end=beta_end, diff --git a/scripts/convert_original_musicldm_to_diffusers.py b/scripts/convert_original_musicldm_to_diffusers.py index bbc2fc96f8..b7da888a06 100644 --- a/scripts/convert_original_musicldm_to_diffusers.py +++ b/scripts/convert_original_musicldm_to_diffusers.py @@ -18,6 +18,7 @@ import argparse import re import torch +import yaml from transformers import ( AutoFeatureExtractor, AutoTokenizer, @@ -39,8 +40,6 @@ from diffusers import ( PNDMScheduler, UNet2DConditionModel, ) -from diffusers.utils import is_omegaconf_available -from diffusers.utils.import_utils import BACKENDS_MAPPING # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments @@ -212,45 +211,45 @@ def create_unet_diffusers_config(original_config, image_size: int): """ Creates a UNet config for diffusers based on the config of the original MusicLDM model. 
""" - unet_params = original_config.model.params.unet_config.params - vae_params = original_config.model.params.first_stage_config.params.ddconfig + unet_params = original_config["model"]["params"]["unet_config"]["params"] + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) cross_attention_dim = ( - unet_params.cross_attention_dim if "cross_attention_dim" in unet_params else block_out_channels + unet_params["cross_attention_dim"] if "cross_attention_dim" in unet_params else block_out_channels ) class_embed_type = "simple_projection" if "extra_film_condition_dim" in unet_params else None projection_class_embeddings_input_dim = ( - unet_params.extra_film_condition_dim if "extra_film_condition_dim" in unet_params else None + unet_params["extra_film_condition_dim"] if "extra_film_condition_dim" in unet_params else None ) - class_embeddings_concat = unet_params.extra_film_use_concat if "extra_film_use_concat" in unet_params else None + class_embeddings_concat = unet_params["extra_film_use_concat"] if "extra_film_use_concat" in unet_params else None config = { "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params.in_channels, - "out_channels": unet_params.out_channels, + "in_channels": unet_params["in_channels"], + "out_channels": unet_params["out_channels"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": unet_params.num_res_blocks, + "layers_per_block": unet_params["num_res_blocks"], "cross_attention_dim": cross_attention_dim, "class_embed_type": class_embed_type, "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, @@ -266,24 +265,24 @@ def create_vae_diffusers_config(original_config, checkpoint, image_size: int): Creates a VAE config for diffusers based on the config of the original MusicLDM model. Compared to the original Stable Diffusion conversion, this function passes a *learnt* VAE scaling factor to the diffusers VAE. 
""" - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"] - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) - scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config.model.params else 0.18215 + scaling_factor = checkpoint["scale_factor"] if "scale_by_std" in original_config["model"]["params"] else 0.18215 config = { "sample_size": image_size, - "in_channels": vae_params.in_channels, - "out_channels": vae_params.out_ch, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "latent_channels": vae_params.z_channels, - "layers_per_block": vae_params.num_res_blocks, + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], "scaling_factor": float(scaling_factor), } return config @@ -292,9 +291,9 @@ def create_vae_diffusers_config(original_config, checkpoint, image_size: int): # Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.create_diffusers_schedular def create_diffusers_schedular(original_config): schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, + num_train_timesteps=original_config["model"]["params"]["timesteps"], + beta_start=original_config["model"]["params"]["linear_start"], + beta_end=original_config["model"]["params"]["linear_end"], beta_schedule="scaled_linear", ) return schedular @@ -674,17 +673,17 @@ def create_transformers_vocoder_config(original_config): """ Creates a config for transformers SpeechT5HifiGan based on the config of the vocoder model. 
""" - vocoder_params = original_config.model.params.vocoder_config.params + vocoder_params = original_config["model"]["params"]["vocoder_config"]["params"] config = { - "model_in_dim": vocoder_params.num_mels, - "sampling_rate": vocoder_params.sampling_rate, - "upsample_initial_channel": vocoder_params.upsample_initial_channel, - "upsample_rates": list(vocoder_params.upsample_rates), - "upsample_kernel_sizes": list(vocoder_params.upsample_kernel_sizes), - "resblock_kernel_sizes": list(vocoder_params.resblock_kernel_sizes), + "model_in_dim": vocoder_params["num_mels"], + "sampling_rate": vocoder_params["sampling_rate"], + "upsample_initial_channel": vocoder_params["upsample_initial_channel"], + "upsample_rates": list(vocoder_params["upsample_rates"]), + "upsample_kernel_sizes": list(vocoder_params["upsample_kernel_sizes"]), + "resblock_kernel_sizes": list(vocoder_params["resblock_kernel_sizes"]), "resblock_dilation_sizes": [ - list(resblock_dilation) for resblock_dilation in vocoder_params.resblock_dilation_sizes + list(resblock_dilation) for resblock_dilation in vocoder_params["resblock_dilation_sizes"] ], "normalize_before": False, } @@ -823,12 +822,6 @@ def load_pipeline_from_original_MusicLDM_ckpt( If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch. return: An MusicLDMPipeline object representing the passed-in `.ckpt`/`.safetensors` file. """ - - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - if from_safetensors: from safetensors import safe_open @@ -848,9 +841,8 @@ def load_pipeline_from_original_MusicLDM_ckpt( if original_config_file is None: original_config = DEFAULT_CONFIG - original_config = OmegaConf.create(original_config) else: - original_config = OmegaConf.load(original_config_file) + original_config = yaml.safe_load(original_config_file) if num_in_channels is not None: original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels @@ -874,9 +866,9 @@ def load_pipeline_from_original_MusicLDM_ckpt( if image_size is None: image_size = 512 - num_train_timesteps = original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end + num_train_timesteps = original_config["model"]["params"]["timesteps"] + beta_start = original_config["model"]["params"]["linear_start"] + beta_end = original_config["model"]["params"]["linear_end"] scheduler = DDIMScheduler( beta_end=beta_end, diff --git a/scripts/convert_vae_pt_to_diffusers.py b/scripts/convert_vae_pt_to_diffusers.py index a8ba48bc00..a4f967c94f 100644 --- a/scripts/convert_vae_pt_to_diffusers.py +++ b/scripts/convert_vae_pt_to_diffusers.py @@ -3,7 +3,7 @@ import io import requests import torch -from omegaconf import OmegaConf +import yaml from diffusers import AutoencoderKL from diffusers.pipelines.stable_diffusion.convert_from_ckpt import ( @@ -126,7 +126,7 @@ def vae_pt_to_vae_diffuser( ) io_obj = io.BytesIO(r.content) - original_config = OmegaConf.load(io_obj) + original_config = yaml.safe_load(io_obj) image_size = 512 device = "cuda" if torch.cuda.is_available() else "cpu" if checkpoint_path.endswith("safetensors"): diff --git a/scripts/convert_vq_diffusion_to_diffusers.py b/scripts/convert_vq_diffusion_to_diffusers.py index 58ed2d93d5..7da6b40949 100644 --- a/scripts/convert_vq_diffusion_to_diffusers.py +++ b/scripts/convert_vq_diffusion_to_diffusers.py @@ -45,51 +45,45 @@ from diffusers import 
Transformer2DModel, VQDiffusionPipeline, VQDiffusionSchedu from diffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings -try: - from omegaconf import OmegaConf -except ImportError: - raise ImportError( - "OmegaConf is required to convert the VQ Diffusion checkpoints. Please install it with `pip install" - " OmegaConf`." - ) - # vqvae model PORTED_VQVAES = ["image_synthesis.modeling.codecs.image_codec.patch_vqgan.PatchVQGAN"] def vqvae_model_from_original_config(original_config): - assert original_config.target in PORTED_VQVAES, f"{original_config.target} has not yet been ported to diffusers." + assert ( + original_config["target"] in PORTED_VQVAES + ), f"{original_config['target']} has not yet been ported to diffusers." - original_config = original_config.params + original_config = original_config["params"] - original_encoder_config = original_config.encoder_config.params - original_decoder_config = original_config.decoder_config.params + original_encoder_config = original_config["encoder_config"]["params"] + original_decoder_config = original_config["decoder_config"]["params"] - in_channels = original_encoder_config.in_channels - out_channels = original_decoder_config.out_ch + in_channels = original_encoder_config["in_channels"] + out_channels = original_decoder_config["out_ch"] down_block_types = get_down_block_types(original_encoder_config) up_block_types = get_up_block_types(original_decoder_config) - assert original_encoder_config.ch == original_decoder_config.ch - assert original_encoder_config.ch_mult == original_decoder_config.ch_mult + assert original_encoder_config["ch"] == original_decoder_config["ch"] + assert original_encoder_config["ch_mult"] == original_decoder_config["ch_mult"] block_out_channels = tuple( - [original_encoder_config.ch * a_ch_mult for a_ch_mult in original_encoder_config.ch_mult] + [original_encoder_config["ch"] * a_ch_mult for a_ch_mult in original_encoder_config["ch_mult"]] ) - assert original_encoder_config.num_res_blocks == original_decoder_config.num_res_blocks - layers_per_block = original_encoder_config.num_res_blocks + assert original_encoder_config["num_res_blocks"] == original_decoder_config["num_res_blocks"] + layers_per_block = original_encoder_config["num_res_blocks"] - assert original_encoder_config.z_channels == original_decoder_config.z_channels - latent_channels = original_encoder_config.z_channels + assert original_encoder_config["z_channels"] == original_decoder_config["z_channels"] + latent_channels = original_encoder_config["z_channels"] - num_vq_embeddings = original_config.n_embed + num_vq_embeddings = original_config["n_embed"] # Hard coded value for ResnetBlock.GoupNorm(num_groups) in VQ-diffusion norm_num_groups = 32 - e_dim = original_config.embed_dim + e_dim = original_config["embed_dim"] model = VQModel( in_channels=in_channels, @@ -108,9 +102,9 @@ def vqvae_model_from_original_config(original_config): def get_down_block_types(original_encoder_config): - attn_resolutions = coerce_attn_resolutions(original_encoder_config.attn_resolutions) - num_resolutions = len(original_encoder_config.ch_mult) - resolution = coerce_resolution(original_encoder_config.resolution) + attn_resolutions = coerce_attn_resolutions(original_encoder_config["attn_resolutions"]) + num_resolutions = len(original_encoder_config["ch_mult"]) + resolution = coerce_resolution(original_encoder_config["resolution"]) curr_res = resolution down_block_types = [] @@ -129,9 +123,9 @@ def 
get_down_block_types(original_encoder_config): def get_up_block_types(original_decoder_config): - attn_resolutions = coerce_attn_resolutions(original_decoder_config.attn_resolutions) - num_resolutions = len(original_decoder_config.ch_mult) - resolution = coerce_resolution(original_decoder_config.resolution) + attn_resolutions = coerce_attn_resolutions(original_decoder_config["attn_resolutions"]) + num_resolutions = len(original_decoder_config["ch_mult"]) + resolution = coerce_resolution(original_decoder_config["resolution"]) curr_res = [r // 2 ** (num_resolutions - 1) for r in resolution] up_block_types = [] @@ -150,7 +144,7 @@ def get_up_block_types(original_decoder_config): def coerce_attn_resolutions(attn_resolutions): - attn_resolutions = OmegaConf.to_object(attn_resolutions) + attn_resolutions = list(attn_resolutions) attn_resolutions_ = [] for ar in attn_resolutions: if isinstance(ar, (list, tuple)): @@ -161,7 +155,6 @@ def coerce_attn_resolutions(attn_resolutions): def coerce_resolution(resolution): - resolution = OmegaConf.to_object(resolution) if isinstance(resolution, int): resolution = [resolution, resolution] # H, W elif isinstance(resolution, (tuple, list)): @@ -472,18 +465,18 @@ def transformer_model_from_original_config( original_diffusion_config, original_transformer_config, original_content_embedding_config ): assert ( - original_diffusion_config.target in PORTED_DIFFUSIONS - ), f"{original_diffusion_config.target} has not yet been ported to diffusers." + original_diffusion_config["target"] in PORTED_DIFFUSIONS + ), f"{original_diffusion_config['target']} has not yet been ported to diffusers." assert ( - original_transformer_config.target in PORTED_TRANSFORMERS - ), f"{original_transformer_config.target} has not yet been ported to diffusers." + original_transformer_config["target"] in PORTED_TRANSFORMERS + ), f"{original_transformer_config['target']} has not yet been ported to diffusers." assert ( - original_content_embedding_config.target in PORTED_CONTENT_EMBEDDINGS - ), f"{original_content_embedding_config.target} has not yet been ported to diffusers." + original_content_embedding_config["target"] in PORTED_CONTENT_EMBEDDINGS + ), f"{original_content_embedding_config['target']} has not yet been ported to diffusers." - original_diffusion_config = original_diffusion_config.params - original_transformer_config = original_transformer_config.params - original_content_embedding_config = original_content_embedding_config.params + original_diffusion_config = original_diffusion_config["params"] + original_transformer_config = original_transformer_config["params"] + original_content_embedding_config = original_content_embedding_config["params"] inner_dim = original_transformer_config["n_embd"] @@ -689,13 +682,11 @@ def transformer_feedforward_to_diffusers_checkpoint(checkpoint, *, diffusers_fee def read_config_file(filename): # The yaml file contains annotations that certain values should - # loaded as tuples. By default, OmegaConf will panic when reading - # these. Instead, we can manually read the yaml with the FullLoader and then - # construct the OmegaConf object. + # loaded as tuples. 
with open(filename) as f: original_config = yaml.load(f, FullLoader) - return OmegaConf.create(original_config) + return original_config # We take separate arguments for the vqvae because the ITHQ vqvae config file @@ -792,9 +783,9 @@ if __name__ == "__main__": - original_config = read_config_file(args.original_config_file).model + original_config = read_config_file(args.original_config_file)["model"] - diffusion_config = original_config.params.diffusion_config - transformer_config = original_config.params.diffusion_config.params.transformer_config - content_embedding_config = original_config.params.diffusion_config.params.content_emb_config + diffusion_config = original_config["params"]["diffusion_config"] + transformer_config = original_config["params"]["diffusion_config"]["params"]["transformer_config"] + content_embedding_config = original_config["params"]["diffusion_config"]["params"]["content_emb_config"] pre_checkpoint = torch.load(args.checkpoint_path, map_location=checkpoint_map_location) @@ -831,7 +822,7 @@ if __name__ == "__main__": # The learned embeddings are stored on the transformer in the original VQ-diffusion. We store them on a separate # model, so we pull them off the checkpoint before the checkpoint is deleted. - learnable_classifier_free_sampling_embeddings = diffusion_config.params.learnable_cf + learnable_classifier_free_sampling_embeddings = diffusion_config["params"]["learnable_cf"] if learnable_classifier_free_sampling_embeddings: learned_classifier_free_sampling_embeddings_embeddings = checkpoint["transformer.empty_text_embed"] diff --git a/scripts/convert_zero123_to_diffusers.py b/scripts/convert_zero123_to_diffusers.py index bdcb2cd2e1..f016312b8b 100644 --- a/scripts/convert_zero123_to_diffusers.py +++ b/scripts/convert_zero123_to_diffusers.py @@ -14,6 +14,7 @@ $ python convert_zero123_to_diffusers.py \ import argparse import torch +import yaml from accelerate import init_empty_weights from accelerate.utils import set_module_tensor_to_device from pipeline_zero1to3 import CCProjection, Zero1to3StableDiffusionPipeline @@ -38,51 +39,54 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa Creates a config for the diffusers based on the config of the LDM model. 
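+    `original_config` is expected to be a plain `dict` as returned by `yaml.safe_load`, so nested values
+    are read with bracket access rather than OmegaConf attribute access, and optional keys are looked up
+    with `in` / `dict.get`. For example (illustrative only):
+
+        unet_params = original_config["model"]["params"]["unet_config"]["params"]
+        num_train_timesteps = original_config["model"]["params"].get("timesteps", None) or 1000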
""" if controlnet: - unet_params = original_config.model.params.control_stage_config.params + unet_params = original_config["model"]["params"]["control_stage_config"]["params"] else: - if "unet_config" in original_config.model.params and original_config.model.params.unet_config is not None: - unet_params = original_config.model.params.unet_config.params + if ( + "unet_config" in original_config["model"]["params"] + and original_config["model"]["params"]["unet_config"] is not None + ): + unet_params = original_config["model"]["params"]["unet_config"]["params"] else: - unet_params = original_config.model.params.network_config.params + unet_params = original_config["model"]["params"]["network_config"]["params"] - vae_params = original_config.model.params.first_stage_config.params.ddconfig + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - if unet_params.transformer_depth is not None: + if unet_params["transformer_depth"] is not None: transformer_layers_per_block = ( - unet_params.transformer_depth - if isinstance(unet_params.transformer_depth, int) - else list(unet_params.transformer_depth) + unet_params["transformer_depth"] + if isinstance(unet_params["transformer_depth"], int) + else list(unet_params["transformer_depth"]) ) else: transformer_layers_per_block = 1 - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) - head_dim = unet_params.num_heads if "num_heads" in unet_params else None + head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None use_linear_projection = ( - unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False ) if use_linear_projection: # stable diffusion 2-base-512 and 2-768 if head_dim is None: - head_dim_mult = unet_params.model_channels // unet_params.num_head_channels - head_dim = [head_dim_mult * c for c in list(unet_params.channel_mult)] + head_dim_mult = unet_params["model_channels"] // unet_params["num_head_channels"] + head_dim = [head_dim_mult * c for c in list(unet_params["channel_mult"])] class_embed_type = None addition_embed_type = None @@ -90,13 +94,15 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa projection_class_embeddings_input_dim = None context_dim = None - if unet_params.context_dim is not None: + if unet_params["context_dim"] is not None: context_dim = ( - unet_params.context_dim if isinstance(unet_params.context_dim, int) else unet_params.context_dim[0] + 
unet_params["context_dim"] + if isinstance(unet_params["context_dim"], int) + else unet_params["context_dim"][0] ) if "num_classes" in unet_params: - if unet_params.num_classes == "sequential": + if unet_params["num_classes"] == "sequential": if context_dim in [2048, 1280]: # SDXL addition_embed_type = "text_time" @@ -104,16 +110,16 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa else: class_embed_type = "projection" assert "adm_in_channels" in unet_params - projection_class_embeddings_input_dim = unet_params.adm_in_channels + projection_class_embeddings_input_dim = unet_params["adm_in_channels"] else: - raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}") + raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params["num_classes"]}") config = { "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params.in_channels, + "in_channels": unet_params["in_channels"], "down_block_types": tuple(down_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": unet_params.num_res_blocks, + "layers_per_block": unet_params["num_res_blocks"], "cross_attention_dim": context_dim, "attention_head_dim": head_dim, "use_linear_projection": use_linear_projection, @@ -125,9 +131,9 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa } if controlnet: - config["conditioning_channels"] = unet_params.hint_channels + config["conditioning_channels"] = unet_params["hint_channels"] else: - config["out_channels"] = unet_params.out_channels + config["out_channels"] = unet_params["out_channels"] config["up_block_types"] = tuple(up_block_types) return config @@ -487,22 +493,22 @@ def create_vae_diffusers_config(original_config, image_size: int): """ Creates a config for the diffusers based on the config of the LDM model. 
""" - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"] - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) config = { "sample_size": image_size, - "in_channels": vae_params.in_channels, - "out_channels": vae_params.out_ch, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "latent_channels": vae_params.z_channels, - "layers_per_block": vae_params.num_res_blocks, + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], } return config @@ -679,18 +685,16 @@ def convert_from_original_zero123_ckpt(checkpoint_path, original_config_file, ex del ckpt torch.cuda.empty_cache() - from omegaconf import OmegaConf - - original_config = OmegaConf.load(original_config_file) - original_config.model.params.cond_stage_config.target.split(".")[-1] + original_config = yaml.safe_load(original_config_file) + original_config["model"]["params"]["cond_stage_config"]["target"].split(".")[-1] num_in_channels = 8 original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels prediction_type = "epsilon" image_size = 256 - num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000 + num_train_timesteps = getattr(original_config["model"]["params"], "timesteps", None) or 1000 - beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02 - beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085 + beta_start = getattr(original_config["model"]["params"], "linear_start", None) or 0.02 + beta_end = getattr(original_config["model"]["params"], "linear_end", None) or 0.085 scheduler = DDIMScheduler( beta_end=beta_end, beta_schedule="scaled_linear", @@ -721,10 +725,10 @@ def convert_from_original_zero123_ckpt(checkpoint_path, original_config_file, ex if ( "model" in original_config - and "params" in original_config.model - and "scale_factor" in original_config.model.params + and "params" in original_config["model"] + and "scale_factor" in original_config["model"]["params"] ): - vae_scaling_factor = original_config.model.params.scale_factor + vae_scaling_factor = original_config["model"]["params"]["scale_factor"] else: vae_scaling_factor = 0.18215 # default SD scaling factor diff --git a/setup.py b/setup.py index 177c918d38..bb0f378170 100644 --- a/setup.py +++ b/setup.py @@ -110,7 +110,6 @@ _deps = [ "note_seq", "librosa", "numpy", - "omegaconf", "parameterized", "peft>=0.6.0", "protobuf>=3.20.3,<4", @@ -213,7 +212,6 @@ extras["test"] = deps_list( "invisible-watermark", "k-diffusion", "librosa", - "omegaconf", "parameterized", "pytest", "pytest-timeout", diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 03e8fe7a0a..e92a486bff 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -22,7 +22,6 @@ deps = { 
"note_seq": "note_seq", "librosa": "librosa", "numpy": "numpy", - "omegaconf": "omegaconf", "parameterized": "parameterized", "peft": "peft>=0.6.0", "protobuf": "protobuf>=3.20.3,<4", diff --git a/src/diffusers/loaders/ip_adapter.py b/src/diffusers/loaders/ip_adapter.py index 039b6b910a..f2ac58cf93 100644 --- a/src/diffusers/loaders/ip_adapter.py +++ b/src/diffusers/loaders/ip_adapter.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os +from pathlib import Path from typing import Dict, Union import torch @@ -138,7 +138,7 @@ class IPAdapterMixin: logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}") image_encoder = CLIPVisionModelWithProjection.from_pretrained( pretrained_model_name_or_path_or_dict, - subfolder=os.path.join(subfolder, "image_encoder"), + subfolder=Path(subfolder, "image_encoder").as_posix(), ).to(self.device, dtype=self.dtype) self.image_encoder = image_encoder self.register_to_config(image_encoder=["transformers", "CLIPVisionModelWithProjection"]) diff --git a/src/diffusers/loaders/lora.py b/src/diffusers/loaders/lora.py index 424e95f084..922c98b98b 100644 --- a/src/diffusers/loaders/lora.py +++ b/src/diffusers/loaders/lora.py @@ -14,6 +14,7 @@ import inspect import os from contextlib import nullcontext +from pathlib import Path from typing import Callable, Dict, List, Optional, Union import safetensors @@ -581,7 +582,6 @@ class LoraLoaderMixin: lora_config_kwargs = get_peft_kwargs( rank, network_alphas, text_encoder_lora_state_dict, is_unet=False ) - lora_config = LoraConfig(**lora_config_kwargs) # adapter_name @@ -961,8 +961,9 @@ class LoraLoaderMixin: else: weight_name = LORA_WEIGHT_NAME - save_function(state_dict, os.path.join(save_directory, weight_name)) - logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}") + save_path = Path(save_directory, weight_name).as_posix() + save_function(state_dict, save_path) + logger.info(f"Model weights saved in {save_path}") def unload_lora_weights(self): """ diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py index b0fe790c22..0fb4637dab 100644 --- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py +++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -13,11 +13,13 @@ # limitations under the License. 
import inspect +import math from dataclasses import dataclass -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import torch +import torch.fft as fft from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from ...image_processor import PipelineImageInput, VaeImageProcessor @@ -36,6 +38,7 @@ from ...schedulers import ( from ...utils import ( USE_PEFT_BACKEND, BaseOutput, + deprecate, logging, replace_example_docstring, scale_lora_layers, @@ -79,6 +82,71 @@ def tensor2vid(video: torch.Tensor, processor, output_type="np"): return outputs +def _get_freeinit_freq_filter( + shape: Tuple[int, ...], + device: Union[str, torch.dtype], + filter_type: str, + order: float, + spatial_stop_frequency: float, + temporal_stop_frequency: float, +) -> torch.Tensor: + r"""Returns the FreeInit filter based on filter type and other input conditions.""" + + T, H, W = shape[-3], shape[-2], shape[-1] + mask = torch.zeros(shape) + + if spatial_stop_frequency == 0 or temporal_stop_frequency == 0: + return mask + + if filter_type == "butterworth": + + def retrieve_mask(x): + return 1 / (1 + (x / spatial_stop_frequency**2) ** order) + elif filter_type == "gaussian": + + def retrieve_mask(x): + return math.exp(-1 / (2 * spatial_stop_frequency**2) * x) + elif filter_type == "ideal": + + def retrieve_mask(x): + return 1 if x <= spatial_stop_frequency * 2 else 0 + else: + raise NotImplementedError("`filter_type` must be one of gaussian, butterworth or ideal") + + for t in range(T): + for h in range(H): + for w in range(W): + d_square = ( + ((spatial_stop_frequency / temporal_stop_frequency) * (2 * t / T - 1)) ** 2 + + (2 * h / H - 1) ** 2 + + (2 * w / W - 1) ** 2 + ) + mask[..., t, h, w] = retrieve_mask(d_square) + + return mask.to(device) + + +def _freq_mix_3d(x: torch.Tensor, noise: torch.Tensor, LPF: torch.Tensor) -> torch.Tensor: + r"""Noise reinitialization.""" + # FFT + x_freq = fft.fftn(x, dim=(-3, -2, -1)) + x_freq = fft.fftshift(x_freq, dim=(-3, -2, -1)) + noise_freq = fft.fftn(noise, dim=(-3, -2, -1)) + noise_freq = fft.fftshift(noise_freq, dim=(-3, -2, -1)) + + # frequency mix + HPF = 1 - LPF + x_freq_low = x_freq * LPF + noise_freq_high = noise_freq * HPF + x_freq_mixed = x_freq_low + noise_freq_high # mix in freq domain + + # IFFT + x_freq_mixed = fft.ifftshift(x_freq_mixed, dim=(-3, -2, -1)) + x_mixed = fft.ifftn(x_freq_mixed, dim=(-3, -2, -1)).real + + return x_mixed + + @dataclass class AnimateDiffPipelineOutput(BaseOutput): frames: Union[torch.Tensor, np.ndarray] @@ -115,6 +183,7 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" _optional_components = ["feature_extractor", "image_encoder"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] def __init__( self, @@ -442,6 +511,58 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap """Disables the FreeU mechanism if enabled.""" self.unet.disable_freeu() + @property + def free_init_enabled(self): + return hasattr(self, "_free_init_num_iters") and self._free_init_num_iters is not None + + def enable_free_init( + self, + num_iters: int = 3, + use_fast_sampling: bool = False, + method: str = "butterworth", + order: int = 4, + spatial_stop_frequency: float = 0.25, + temporal_stop_frequency: float = 0.25, + generator: torch.Generator = None, + ): + 
"""Enables the FreeInit mechanism as in https://arxiv.org/abs/2312.07537. + + This implementation has been adapted from the [official repository](https://github.com/TianxingWu/FreeInit). + + Args: + num_iters (`int`, *optional*, defaults to `3`): + Number of FreeInit noise re-initialization iterations. + use_fast_sampling (`bool`, *optional*, defaults to `False`): + Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables + the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`. + method (`str`, *optional*, defaults to `butterworth`): + Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the + FreeInit low pass filter. + order (`int`, *optional*, defaults to `4`): + Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour + whereas lower values lead to `gaussian` method behaviour. + spatial_stop_frequency (`float`, *optional*, defaults to `0.25`): + Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in + the original implementation. + temporal_stop_frequency (`float`, *optional*, defaults to `0.25`): + Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in + the original implementation. + generator (`torch.Generator`, *optional*, defaults to `0.25`): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + FreeInit generation deterministic. + """ + self._free_init_num_iters = num_iters + self._free_init_use_fast_sampling = use_fast_sampling + self._free_init_method = method + self._free_init_order = order + self._free_init_spatial_stop_frequency = spatial_stop_frequency + self._free_init_temporal_stop_frequency = temporal_stop_frequency + self._free_init_generator = generator + + def disable_free_init(self): + """Disables the FreeInit mechanism if enabled.""" + self._free_init_num_iters = None + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature @@ -539,6 +660,185 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap latents = latents * self.scheduler.init_noise_sigma return latents + def _denoise_loop( + self, + timesteps, + num_inference_steps, + do_classifier_free_guidance, + guidance_scale, + num_warmup_steps, + prompt_embeds, + negative_prompt_embeds, + latents, + cross_attention_kwargs, + added_cond_kwargs, + extra_step_kwargs, + callback, + callback_steps, + callback_on_step_end, + callback_on_step_end_tensor_inputs, + ): + """Denoising loop for AnimateDiff.""" + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + 
guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + return latents + + def _free_init_loop( + self, + height, + width, + num_frames, + num_channels_latents, + batch_size, + num_videos_per_prompt, + denoise_args, + device, + ): + """Denoising loop for AnimateDiff using FreeInit noise reinitialization technique.""" + + latents = denoise_args.get("latents") + prompt_embeds = denoise_args.get("prompt_embeds") + timesteps = denoise_args.get("timesteps") + num_inference_steps = denoise_args.get("num_inference_steps") + + latent_shape = ( + batch_size * num_videos_per_prompt, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + free_init_filter_shape = ( + 1, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + free_init_freq_filter = _get_freeinit_freq_filter( + shape=free_init_filter_shape, + device=device, + filter_type=self._free_init_method, + order=self._free_init_order, + spatial_stop_frequency=self._free_init_spatial_stop_frequency, + temporal_stop_frequency=self._free_init_temporal_stop_frequency, + ) + + with self.progress_bar(total=self._free_init_num_iters) as free_init_progress_bar: + for i in range(self._free_init_num_iters): + # For the first FreeInit iteration, the original latent is used without modification. + # Subsequent iterations apply the noise reinitialization technique. 
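+                    # Sketch of the reinitialization steps implemented below:
+                    #   1. re-diffuse the current latents back to the t=T noise level with `scheduler.add_noise`,
+                    #      reusing the noise saved from the first iteration (`initial_noise`)
+                    #   2. sample fresh Gaussian noise `z_rand`
+                    #   3. keep the low-frequency band of the re-diffused latents and the high-frequency band
+                    #      of `z_rand`, i.e. latents = IFFT(FFT(z_T) * LPF + FFT(z_rand) * (1 - LPF))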
+ if i == 0: + initial_noise = latents.detach().clone() + else: + current_diffuse_timestep = ( + self.scheduler.config.num_train_timesteps - 1 + ) # diffuse to t=999 noise level + diffuse_timesteps = torch.full((batch_size,), current_diffuse_timestep).long() + z_T = self.scheduler.add_noise( + original_samples=latents, noise=initial_noise, timesteps=diffuse_timesteps.to(device) + ).to(dtype=torch.float32) + z_rand = randn_tensor( + shape=latent_shape, + generator=self._free_init_generator, + device=device, + dtype=torch.float32, + ) + latents = _freq_mix_3d(z_T, z_rand, LPF=free_init_freq_filter) + latents = latents.to(prompt_embeds.dtype) + + # Coarse-to-Fine Sampling for faster inference (can lead to lower quality) + if self._free_init_use_fast_sampling: + current_num_inference_steps = int(num_inference_steps / self._free_init_num_iters * (i + 1)) + self.scheduler.set_timesteps(current_num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + denoise_args.update({"timesteps": timesteps, "num_inference_steps": current_num_inference_steps}) + + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + denoise_args.update({"latents": latents, "num_warmup_steps": num_warmup_steps}) + latents = self._denoise_loop(**denoise_args) + + free_init_progress_bar.update() + + return latents + + def _retrieve_video_frames(self, latents, output_type, return_dict): + """Helper function to handle latents to output conversion.""" + if output_type == "latent": + return AnimateDiffPipelineOutput(frames=latents) + + video_tensor = self.decode_latents(latents) + + if output_type == "pt": + video = video_tensor + else: + video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + + if not return_dict: + return (video,) + + return AnimateDiffPipelineOutput(frames=video) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( @@ -559,10 +859,11 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap ip_adapter_image: Optional[PipelineImageInput] = None, output_type: Optional[str] = "pil", return_dict: bool = True, - callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, - callback_steps: Optional[int] = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, ): r""" The call function to the pipeline for generation. @@ -603,25 +904,30 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap negative_prompt_embeds (`torch.FloatTensor`, *optional*): Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
- ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead of a plain tuple. - callback (`Callable`, *optional*): - A function that calls every `callback_steps` steps during inference. The function is called with the - following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, *optional*, defaults to 1): - The frequency at which the `callback` function is called. If not specified, the callback is called at - every step. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). clip_skip (`int`, *optional*): Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeine class. + Examples: Returns: @@ -629,6 +935,23 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor @@ -637,9 +960,20 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap # 1. Check inputs. 
Raise error if not correct self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + callback_on_step_end_tensor_inputs, ) + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + # 2. Define call parameters if prompt is not None and isinstance(prompt, str): batch_size = 1 @@ -649,30 +983,26 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap batch_size = prompt_embeds.shape[0] device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt text_encoder_lora_scale = ( - cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None ) prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, device, num_videos_per_prompt, - do_classifier_free_guidance, + self.do_classifier_free_guidance, negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, lora_scale=text_encoder_lora_scale, - clip_skip=clip_skip, + clip_skip=self.clip_skip, ) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes - if do_classifier_free_guidance: + if self.do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) if ip_adapter_image is not None: @@ -680,12 +1010,13 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap image_embeds, negative_image_embeds = self.encode_image( ip_adapter_image, device, num_videos_per_prompt, output_hidden_state ) - if do_classifier_free_guidance: + if self.do_classifier_free_guidance: image_embeds = torch.cat([negative_image_embeds, image_embeds]) # 4. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) timesteps = self.scheduler.timesteps + self._num_timesteps = len(timesteps) # 5. Prepare latent variables num_channels_latents = self.unet.config.in_channels @@ -703,55 +1034,47 @@ class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdap # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 7 Add image embeds for IP-Adapter + + # 7. Add image embeds for IP-Adapter added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None - # Denoising loop + # 8. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - with self.progress_bar(total=num_inference_steps) as progress_bar: - for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + denoise_args = { + "timesteps": timesteps, + "num_inference_steps": num_inference_steps, + "do_classifier_free_guidance": self.do_classifier_free_guidance, + "guidance_scale": guidance_scale, + "num_warmup_steps": num_warmup_steps, + "prompt_embeds": prompt_embeds, + "negative_prompt_embeds": negative_prompt_embeds, + "latents": latents, + "cross_attention_kwargs": self.cross_attention_kwargs, + "added_cond_kwargs": added_cond_kwargs, + "extra_step_kwargs": extra_step_kwargs, + "callback": callback, + "callback_steps": callback_steps, + "callback_on_step_end": callback_on_step_end, + "callback_on_step_end_tensor_inputs": callback_on_step_end_tensor_inputs, + } - # predict the noise residual - noise_pred = self.unet( - latent_model_input, - t, - encoder_hidden_states=prompt_embeds, - cross_attention_kwargs=cross_attention_kwargs, - added_cond_kwargs=added_cond_kwargs, - ).sample - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - progress_bar.update() - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - return AnimateDiffPipelineOutput(frames=latents) - - # Post-processing - video_tensor = self.decode_latents(latents) - - if output_type == "pt": - video = video_tensor + if self.free_init_enabled: + latents = self._free_init_loop( + height=height, + width=width, + num_frames=num_frames, + num_channels_latents=num_channels_latents, + batch_size=batch_size, + num_videos_per_prompt=num_videos_per_prompt, + denoise_args=denoise_args, + device=device, + ) else: - video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + latents = self._denoise_loop(**denoise_args) - # Offload all models + video = self._retrieve_video_frames(latents, output_type, return_dict) + + # 9. Offload all models self.maybe_free_model_hooks() - if not return_dict: - return (video,) - - return AnimateDiffPipelineOutput(frames=video) + return video diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 6bdc281ef8..6cd1658c59 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -603,15 +603,6 @@ class StableDiffusionControlNetPipeline( f" {negative_prompt_embeds.shape}." ) - # `prompt` needs more sophisticated handling when there are multiple - # conditionings. - if isinstance(self.controlnet, MultiControlNetModel): - if isinstance(prompt, list): - logger.warning( - f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" - " prompts. The conditionings will be fixed across the prompts." 
- ) - # Check `image` is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( self.controlnet, torch._dynamo.eval_frame.OptimizedModule @@ -633,7 +624,13 @@ class StableDiffusionControlNetPipeline( # When `image` is a nested list: # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) elif any(isinstance(i, list) for i in image): - raise ValueError("A single batch of multiple conditionings is not supported at the moment.") + transposed_image = [list(t) for t in zip(*image)] + if len(transposed_image) != len(self.controlnet.nets): + raise ValueError( + f"For multiple controlnets: if you pass `image` as a list of lists, each sublist must have the same length as the number of controlnets, but the sublists in `image` got {len(transposed_image)} images and {len(self.controlnet.nets)} ControlNets." + ) + for image_ in transposed_image: + self.check_image(image_, prompt, prompt_embeds) elif len(image) != len(self.controlnet.nets): raise ValueError( f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." @@ -659,7 +656,10 @@ class StableDiffusionControlNetPipeline( ): if isinstance(controlnet_conditioning_scale, list): if any(isinstance(i, list) for i in controlnet_conditioning_scale): - raise ValueError("A single batch of multiple conditionings is not supported at the moment.") + raise ValueError( + "A single batch of varying conditioning scale settings (e.g. [[1.0, 0.5], [0.2, 0.8]]) is not supported at the moment. " + "The conditioning scale must be fixed across the batch." + ) elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( self.controlnet.nets ): @@ -906,7 +906,9 @@ class StableDiffusionControlNetPipeline( accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`, images must be passed as a list such that each element of the list can be correctly batched for - input to a single ControlNet. + input to a single ControlNet. When `prompt` is a list and a list of images is passed for a single ControlNet, + each image will be paired with each prompt in the `prompt` list. This also applies to multiple ControlNets, + where a list of image lists can be passed to batch for each prompt and each ControlNet. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. 
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): @@ -1105,6 +1107,11 @@ class StableDiffusionControlNetPipeline( elif isinstance(controlnet, MultiControlNetModel): images = [] + # Nested lists as ControlNet condition + if isinstance(image[0], list): + # Transpose the nested image list + image = [list(t) for t in zip(*image)] + for image_ in image: image_ = self.prepare_image( image=image_, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 2083a6391c..6e00134591 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -23,7 +23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPV from ...image_processor import PipelineImageInput, VaeImageProcessor from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin -from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel +from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel from ...models.lora import adjust_lora_scale_text_encoder from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -1087,7 +1087,10 @@ class StableDiffusionControlNetImg2ImgPipeline( prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) if ip_adapter_image is not None: - image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_images_per_prompt, output_hidden_state + ) if self.do_classifier_free_guidance: image_embeds = torch.cat([negative_image_embeds, image_embeds]) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 5aa23252b8..1ba06f811a 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -21,6 +21,7 @@ from typing import Dict, Optional, Union import requests import torch +import yaml from transformers import ( AutoFeatureExtractor, BertTokenizerFast, @@ -50,8 +51,7 @@ from ...schedulers import ( PNDMScheduler, UnCLIPScheduler, ) -from ...utils import is_accelerate_available, is_omegaconf_available, logging -from ...utils.import_utils import BACKENDS_MAPPING +from ...utils import is_accelerate_available, logging from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel from ..paint_by_example import PaintByExampleImageEncoder from ..pipeline_utils import DiffusionPipeline @@ -237,51 +237,54 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa Creates a config for the diffusers based on the config of the LDM model. 
""" if controlnet: - unet_params = original_config.model.params.control_stage_config.params + unet_params = original_config["model"]["params"]["control_stage_config"]["params"] else: - if "unet_config" in original_config.model.params and original_config.model.params.unet_config is not None: - unet_params = original_config.model.params.unet_config.params + if ( + "unet_config" in original_config["model"]["params"] + and original_config["model"]["params"]["unet_config"] is not None + ): + unet_params = original_config["model"]["params"]["unet_config"]["params"] else: - unet_params = original_config.model.params.network_config.params + unet_params = original_config["model"]["params"]["network_config"]["params"] - vae_params = original_config.model.params.first_stage_config.params.ddconfig + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] - block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult] + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] down_block_types = [] resolution = 1 for i in range(len(block_out_channels)): - block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D" + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" down_block_types.append(block_type) if i != len(block_out_channels) - 1: resolution *= 2 up_block_types = [] for i in range(len(block_out_channels)): - block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D" + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" up_block_types.append(block_type) resolution //= 2 - if unet_params.transformer_depth is not None: + if unet_params["transformer_depth"] is not None: transformer_layers_per_block = ( - unet_params.transformer_depth - if isinstance(unet_params.transformer_depth, int) - else list(unet_params.transformer_depth) + unet_params["transformer_depth"] + if isinstance(unet_params["transformer_depth"], int) + else list(unet_params["transformer_depth"]) ) else: transformer_layers_per_block = 1 - vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) - head_dim = unet_params.num_heads if "num_heads" in unet_params else None + head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None use_linear_projection = ( - unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False + unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False ) if use_linear_projection: # stable diffusion 2-base-512 and 2-768 if head_dim is None: - head_dim_mult = unet_params.model_channels // unet_params.num_head_channels - head_dim = [head_dim_mult * c for c in list(unet_params.channel_mult)] + head_dim_mult = unet_params["model_channels"] // unet_params["num_head_channels"] + head_dim = [head_dim_mult * c for c in list(unet_params["channel_mult"])] class_embed_type = None addition_embed_type = None @@ -289,13 +292,15 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa projection_class_embeddings_input_dim = None context_dim = None - if unet_params.context_dim is not None: + if unet_params["context_dim"] is not None: context_dim = ( - unet_params.context_dim if isinstance(unet_params.context_dim, int) else unet_params.context_dim[0] + 
unet_params["context_dim"] + if isinstance(unet_params["context_dim"], int) + else unet_params["context_dim"][0] ) if "num_classes" in unet_params: - if unet_params.num_classes == "sequential": + if unet_params["num_classes"] == "sequential": if context_dim in [2048, 1280]: # SDXL addition_embed_type = "text_time" @@ -303,14 +308,14 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa else: class_embed_type = "projection" assert "adm_in_channels" in unet_params - projection_class_embeddings_input_dim = unet_params.adm_in_channels + projection_class_embeddings_input_dim = unet_params["adm_in_channels"] config = { "sample_size": image_size // vae_scale_factor, - "in_channels": unet_params.in_channels, + "in_channels": unet_params["in_channels"], "down_block_types": tuple(down_block_types), "block_out_channels": tuple(block_out_channels), - "layers_per_block": unet_params.num_res_blocks, + "layers_per_block": unet_params["num_res_blocks"], "cross_attention_dim": context_dim, "attention_head_dim": head_dim, "use_linear_projection": use_linear_projection, @@ -322,15 +327,15 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa } if "disable_self_attentions" in unet_params: - config["only_cross_attention"] = unet_params.disable_self_attentions + config["only_cross_attention"] = unet_params["disable_self_attentions"] - if "num_classes" in unet_params and isinstance(unet_params.num_classes, int): - config["num_class_embeds"] = unet_params.num_classes + if "num_classes" in unet_params and isinstance(unet_params["num_classes"], int): + config["num_class_embeds"] = unet_params["num_classes"] if controlnet: - config["conditioning_channels"] = unet_params.hint_channels + config["conditioning_channels"] = unet_params["hint_channels"] else: - config["out_channels"] = unet_params.out_channels + config["out_channels"] = unet_params["out_channels"] config["up_block_types"] = tuple(up_block_types) return config @@ -340,38 +345,38 @@ def create_vae_diffusers_config(original_config, image_size: int): """ Creates a config for the diffusers based on the config of the LDM model. 
""" - vae_params = original_config.model.params.first_stage_config.params.ddconfig - _ = original_config.model.params.first_stage_config.params.embed_dim + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"] - block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult] + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) config = { "sample_size": image_size, - "in_channels": vae_params.in_channels, - "out_channels": vae_params.out_ch, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], "down_block_types": tuple(down_block_types), "up_block_types": tuple(up_block_types), "block_out_channels": tuple(block_out_channels), - "latent_channels": vae_params.z_channels, - "layers_per_block": vae_params.num_res_blocks, + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], } return config def create_diffusers_schedular(original_config): schedular = DDIMScheduler( - num_train_timesteps=original_config.model.params.timesteps, - beta_start=original_config.model.params.linear_start, - beta_end=original_config.model.params.linear_end, + num_train_timesteps=original_config["model"]["params"]["timesteps"], + beta_start=original_config["model"]["params"]["linear_start"], + beta_end=original_config["model"]["params"]["linear_end"], beta_schedule="scaled_linear", ) return schedular def create_ldm_bert_config(original_config): - bert_params = original_config.model.params.cond_stage_config.params + bert_params = original_config["model"]["params"]["cond_stage_config"]["params"] config = LDMBertConfig( d_model=bert_params.n_embed, encoder_layers=bert_params.n_layer, @@ -1006,9 +1011,9 @@ def stable_unclip_image_encoder(original_config, local_files_only=False): encoders. """ - image_embedder_config = original_config.model.params.embedder_config + image_embedder_config = original_config["model"]["params"]["embedder_config"] - sd_clip_image_embedder_class = image_embedder_config.target + sd_clip_image_embedder_class = image_embedder_config["target"] sd_clip_image_embedder_class = sd_clip_image_embedder_class.split(".")[-1] if sd_clip_image_embedder_class == "ClipImageEmbedder": @@ -1047,8 +1052,8 @@ def stable_unclip_image_noising_components( If the noise augmentor config specifies a clip stats path, the `clip_stats_path` must be provided. 
""" - noise_aug_config = original_config.model.params.noise_aug_config - noise_aug_class = noise_aug_config.target + noise_aug_config = original_config["model"]["params"]["noise_aug_config"] + noise_aug_class = noise_aug_config["target"] noise_aug_class = noise_aug_class.split(".")[-1] if noise_aug_class == "CLIPEmbeddingNoiseAugmentation": @@ -1245,11 +1250,6 @@ def download_from_original_stable_diffusion_ckpt( if prediction_type == "v-prediction": prediction_type = "v_prediction" - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - if isinstance(checkpoint_path_or_dict, str): if from_safetensors: from safetensors.torch import load_file as safe_load @@ -1317,19 +1317,22 @@ def download_from_original_stable_diffusion_ckpt( if config_url is not None: original_config_file = BytesIO(requests.get(config_url).content) + else: + with open(original_config_file, "r") as f: + original_config_file = f.read() - original_config = OmegaConf.load(original_config_file) + original_config = yaml.safe_load(original_config_file) # Convert the text model. if ( model_type is None - and "cond_stage_config" in original_config.model.params - and original_config.model.params.cond_stage_config is not None + and "cond_stage_config" in original_config["model"]["params"] + and original_config["model"]["params"]["cond_stage_config"] is not None ): - model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] + model_type = original_config["model"]["params"]["cond_stage_config"]["target"].split(".")[-1] logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") - elif model_type is None and original_config.model.params.network_config is not None: - if original_config.model.params.network_config.params.context_dim == 2048: + elif model_type is None and original_config["model"]["params"]["network_config"] is not None: + if original_config["model"]["params"]["network_config"]["params"]["context_dim"] == 2048: model_type = "SDXL" else: model_type = "SDXL-Refiner" @@ -1354,7 +1357,7 @@ def download_from_original_stable_diffusion_ckpt( elif num_in_channels is None: num_in_channels = 4 - if "unet_config" in original_config.model.params: + if "unet_config" in original_config["model"]["params"]: original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels if ( @@ -1375,13 +1378,16 @@ def download_from_original_stable_diffusion_ckpt( if image_size is None: image_size = 512 - if controlnet is None and "control_stage_config" in original_config.model.params: + if controlnet is None and "control_stage_config" in original_config["model"]["params"]: path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else "" controlnet = convert_controlnet_checkpoint( checkpoint, original_config, path, image_size, upcast_attention, extract_ema ) - num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000 + if "timesteps" in original_config["model"]["params"]: + num_train_timesteps = original_config["model"]["params"]["timesteps"] + else: + num_train_timesteps = 1000 if model_type in ["SDXL", "SDXL-Refiner"]: scheduler_dict = { @@ -1400,8 +1406,15 @@ def download_from_original_stable_diffusion_ckpt( scheduler = EulerDiscreteScheduler.from_config(scheduler_dict) scheduler_type = "euler" else: - beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02 - beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085 + if 
"linear_start" in original_config["model"]["params"]: + beta_start = original_config["model"]["params"]["linear_start"] + else: + beta_start = 0.02 + + if "linear_end" in original_config["model"]["params"]: + beta_end = original_config["model"]["params"]["linear_end"] + else: + beta_end = 0.085 scheduler = DDIMScheduler( beta_end=beta_end, beta_schedule="scaled_linear", @@ -1435,7 +1448,7 @@ def download_from_original_stable_diffusion_ckpt( raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") if pipeline_class == StableDiffusionUpscalePipeline: - image_size = original_config.model.params.unet_config.params.image_size + image_size = original_config["model"]["params"]["unet_config"]["params"]["image_size"] # Convert the UNet2DConditionModel model. unet_config = create_unet_diffusers_config(original_config, image_size=image_size) @@ -1464,10 +1477,10 @@ def download_from_original_stable_diffusion_ckpt( if ( "model" in original_config - and "params" in original_config.model - and "scale_factor" in original_config.model.params + and "params" in original_config["model"] + and "scale_factor" in original_config["model"]["params"] ): - vae_scaling_factor = original_config.model.params.scale_factor + vae_scaling_factor = original_config["model"]["params"]["scale_factor"] else: vae_scaling_factor = 0.18215 # default SD scaling factor @@ -1803,11 +1816,6 @@ def download_controlnet_from_original_ckpt( use_linear_projection: Optional[bool] = None, cross_attention_dim: Optional[bool] = None, ) -> DiffusionPipeline: - if not is_omegaconf_available(): - raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) - - from omegaconf import OmegaConf - if from_safetensors: from safetensors import safe_open @@ -1827,12 +1835,12 @@ def download_controlnet_from_original_ckpt( while "state_dict" in checkpoint: checkpoint = checkpoint["state_dict"] - original_config = OmegaConf.load(original_config_file) + original_config = yaml.safe_load(original_config_file) if num_in_channels is not None: original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels - if "control_stage_config" not in original_config.model.params: + if "control_stage_config" not in original_config["model"]["params"]: raise ValueError("`control_stage_config` not present in original config") controlnet = convert_controlnet_checkpoint( diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py index b06363cffd..b653d8e9f7 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -858,7 +858,7 @@ class StableDiffusionXLInstructPix2PixPipeline( ) # 4. Preprocess image - image = self.image_processor.preprocess(image).to(device) + image = self.image_processor.preprocess(image, height=height, width=width).to(device) # 5. 
Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) diff --git a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index e5360d37c6..56f7269130 100644 --- a/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -52,6 +52,9 @@ def tensor2vid(video: torch.Tensor, processor, output_type="np"): outputs.append(batch_output) + if output_type == "np": + return np.stack(outputs) + return outputs diff --git a/src/diffusers/training_utils.py b/src/diffusers/training_utils.py index 9fb6fad3a2..596e5c4868 100644 --- a/src/diffusers/training_utils.py +++ b/src/diffusers/training_utils.py @@ -1,19 +1,28 @@ import contextlib import copy import random -from typing import Any, Dict, Iterable, Optional, Union +from typing import Any, Dict, Iterable, List, Optional, Union import numpy as np import torch from torchvision import transforms from .models import UNet2DConditionModel -from .utils import deprecate, is_transformers_available +from .utils import ( + convert_state_dict_to_diffusers, + convert_state_dict_to_peft, + deprecate, + is_peft_available, + is_transformers_available, +) if is_transformers_available(): import transformers +if is_peft_available(): + from peft import set_peft_model_state_dict + def set_seed(seed: int): """ @@ -112,6 +121,35 @@ def unet_lora_state_dict(unet: UNet2DConditionModel) -> Dict[str, torch.Tensor]: return lora_state_dict +def cast_training_params(model: Union[torch.nn.Module, List[torch.nn.Module]], dtype=torch.float32): + if not isinstance(model, list): + model = [model] + for m in model: + for param in m.parameters(): + # only upcast trainable parameters into fp32 + if param.requires_grad: + param.data = param.to(dtype) + + +def _set_state_dict_into_text_encoder( + lora_state_dict: Dict[str, torch.Tensor], prefix: str, text_encoder: torch.nn.Module +): + """ + Sets the `lora_state_dict` into `text_encoder` coming from `transformers`. + + Args: + lora_state_dict: The state dictionary to be set. + prefix: String identifier to retrieve the portion of the state dict that belongs to `text_encoder`. + text_encoder: Where the `lora_state_dict` is to be set. 
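+
+    Example (a minimal, illustrative sketch; the `"text_encoder."` prefix here is an assumed value, not required by the helper):
+
+        ```py
+        _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder)
+        ```
+
+    Entries whose keys do not start with `prefix` are ignored.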
+ """ + + text_encoder_state_dict = { + f'{k.replace(prefix, "")}': v for k, v in lora_state_dict.items() if k.startswith(prefix) + } + text_encoder_state_dict = convert_state_dict_to_peft(convert_state_dict_to_diffusers(text_encoder_state_dict)) + set_peft_model_state_dict(text_encoder, text_encoder_state_dict, adapter_name="default") + + # Adapted from torch-ema https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14 class EMAModel: """ diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 54fde3e1f7..667f1fe5e2 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -67,7 +67,6 @@ from .import_utils import ( is_k_diffusion_version, is_librosa_available, is_note_seq_available, - is_omegaconf_available, is_onnx_available, is_peft_available, is_scipy_available, diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index b3278af2f6..ac1565023b 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -223,12 +223,6 @@ try: except importlib_metadata.PackageNotFoundError: _wandb_available = False -_omegaconf_available = importlib.util.find_spec("omegaconf") is not None -try: - _omegaconf_version = importlib_metadata.version("omegaconf") - logger.debug(f"Successfully imported omegaconf version {_omegaconf_version}") -except importlib_metadata.PackageNotFoundError: - _omegaconf_available = False _tensorboard_available = importlib.util.find_spec("tensorboard") try: @@ -345,10 +339,6 @@ def is_wandb_available(): return _wandb_available -def is_omegaconf_available(): - return _omegaconf_available - - def is_tensorboard_available(): return _tensorboard_available @@ -449,12 +439,6 @@ WANDB_IMPORT_ERROR = """ install wandb` """ -# docstyle-ignore -OMEGACONF_IMPORT_ERROR = """ -{0} requires the omegaconf library but it was not found in your environment. You can install it with pip: `pip -install omegaconf` -""" - # docstyle-ignore TENSORBOARD_IMPORT_ERROR = """ {0} requires the tensorboard library but it was not found in your environment. 
You can install it with pip: `pip @@ -506,7 +490,6 @@ BACKENDS_MAPPING = OrderedDict( ("k_diffusion", (is_k_diffusion_available, K_DIFFUSION_IMPORT_ERROR)), ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)), ("wandb", (is_wandb_available, WANDB_IMPORT_ERROR)), - ("omegaconf", (is_omegaconf_available, OMEGACONF_IMPORT_ERROR)), ("tensorboard", (is_tensorboard_available, TENSORBOARD_IMPORT_ERROR)), ("compel", (is_compel_available, COMPEL_IMPORT_ERROR)), ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index df1a4fc420..86e31eb688 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -137,7 +137,7 @@ def get_tests_dir(append_path=None): tests_dir = os.path.dirname(tests_dir) if append_path: - return os.path.join(tests_dir, append_path) + return Path(tests_dir, append_path).as_posix() else: return tests_dir @@ -335,10 +335,9 @@ def require_python39_or_higher(test_case): def load_numpy(arry: Union[str, np.ndarray], local_path: Optional[str] = None) -> np.ndarray: if isinstance(arry, str): - # local_path = "/home/patrick_huggingface_co/" if local_path is not None: # local_path can be passed to correct images of tests - return os.path.join(local_path, "/".join([arry.split("/")[-5], arry.split("/")[-2], arry.split("/")[-1]])) + return Path(local_path, arry.split("/")[-5], arry.split("/")[-2], arry.split("/")[-1]).as_posix() elif arry.startswith("http://") or arry.startswith("https://"): response = requests.get(arry) response.raise_for_status() @@ -520,10 +519,10 @@ def export_to_video(video_frames: List[np.ndarray], output_video_path: str = Non def load_hf_numpy(path) -> np.ndarray: - if not path.startswith("http://") or path.startswith("https://"): - path = os.path.join( - "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main", urllib.parse.quote(path) - ) + base_url = "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main" + + if not path.startswith("http://") and not path.startswith("https://"): + path = os.path.join(base_url, urllib.parse.quote(path)) return load_numpy(path) diff --git a/tests/pipelines/animatediff/test_animatediff.py b/tests/pipelines/animatediff/test_animatediff.py index 633ed9fc23..44cb730a95 100644 --- a/tests/pipelines/animatediff/test_animatediff.py +++ b/tests/pipelines/animatediff/test_animatediff.py @@ -38,8 +38,8 @@ class AnimateDiffPipelineFastTests(PipelineTesterMixin, unittest.TestCase): "generator", "latents", "return_dict", - "callback", - "callback_steps", + "callback_on_step_end", + "callback_on_step_end_tensor_inputs", ] ) @@ -233,6 +233,43 @@ class AnimateDiffPipelineFastTests(PipelineTesterMixin, unittest.TestCase): inputs["prompt_embeds"] = torch.randn((1, 4, 32), device=torch_device) pipe(**inputs) + def test_free_init(self): + components = self.get_dummy_components() + pipe: AnimateDiffPipeline = self.pipeline_class(**components) + pipe.set_progress_bar_config(disable=None) + pipe.to(torch_device) + + inputs_normal = self.get_dummy_inputs(torch_device) + frames_normal = pipe(**inputs_normal).frames[0] + + free_init_generator = torch.Generator(device=torch_device).manual_seed(0) + pipe.enable_free_init( + num_iters=2, + use_fast_sampling=True, + method="butterworth", + order=4, + spatial_stop_frequency=0.25, + temporal_stop_frequency=0.25, + generator=free_init_generator, + ) + inputs_enable_free_init = self.get_dummy_inputs(torch_device) + frames_enable_free_init = 
pipe(**inputs_enable_free_init).frames[0] + + pipe.disable_free_init() + inputs_disable_free_init = self.get_dummy_inputs(torch_device) + frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0] + + sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum() + max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max() + self.assertGreater( + sum_enabled, 1e2, "Enabling of FreeInit should lead to results different from the default pipeline results" + ) + self.assertLess( + max_diff_disabled, + 1e-4, + "Disabling of FreeInit should lead to results similar to the default pipeline results", + ) + @unittest.skipIf( torch_device != "cuda" or not is_xformers_available(), reason="XFormers attention is only available with CUDA and `xformers` installed", diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index ce86933430..c034a9b68b 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -460,6 +460,33 @@ class StableDiffusionMultiControlNetPipelineFastTests( except NotImplementedError: pass + def test_inference_multiple_prompt_input(self): + device = "cpu" + + components = self.get_dummy_components() + sd_pipe = StableDiffusionControlNetPipeline(**components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + inputs["prompt"] = [inputs["prompt"], inputs["prompt"]] + inputs["image"] = [inputs["image"], inputs["image"]] + output = sd_pipe(**inputs) + image = output.images + + assert image.shape == (2, 64, 64, 3) + + image_1, image_2 = image + # make sure that the outputs are different + assert np.sum(np.abs(image_1 - image_2)) > 1e-3 + + # multiple prompts, single image conditioning + inputs = self.get_dummy_inputs(device) + inputs["prompt"] = [inputs["prompt"], inputs["prompt"]] + output_1 = sd_pipe(**inputs) + + assert np.abs(image - output_1.images).max() < 1e-3 + class StableDiffusionMultiControlNetOneModelPipelineFastTests( PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py index 158385db02..e9758bf6eb 100644 --- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_k_diffusion.py @@ -64,7 +64,7 @@ class StableDiffusionXLKPipelineIntegrationTests(unittest.TestCase): assert image.shape == (1, 512, 512, 3) expected_slice = np.array( - [0.79804534, 0.7981539, 0.8019961, 0.7936565, 0.7892033, 0.7914713, 0.7792827, 0.77754563, 0.7836789] + [0.79600024, 0.796546, 0.80682373, 0.79428387, 0.7905743, 0.8008807, 0.786183, 0.7835959, 0.797892] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @@ -97,7 +97,7 @@ class StableDiffusionXLKPipelineIntegrationTests(unittest.TestCase): assert image.shape == (1, 512, 512, 3) expected_slice = np.array( - [0.9704869, 0.9714559, 0.9693254, 0.96892524, 0.9685236, 0.9659081, 0.9666761, 0.9619067, 0.961759] + [0.9389532, 0.9408587, 0.9394901, 0.939082, 0.9402114, 0.9382007, 0.93737566, 0.9346897, 0.9324472] ) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py 
b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py index 1197842436..871266fb9c 100644 --- a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py +++ b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py @@ -185,6 +185,23 @@ class StableVideoDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCa def test_inference_batch_consistent(self): pass + def test_np_output_type(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + for component in pipe.components.values(): + if hasattr(component, "set_default_attn_processor"): + component.set_default_attn_processor() + + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator_device = "cpu" + inputs = self.get_dummy_inputs(generator_device) + inputs["output_type"] = "np" + output = pipe(**inputs).frames + self.assertTrue(isinstance(output, np.ndarray)) + self.assertEqual(len(output.shape), 5) + def test_dict_tuple_outputs_equivalent(self, expected_max_difference=1e-4): components = self.get_dummy_components() pipe = self.pipeline_class(**components)