From 0ab63ff6478b7cc6b5ae0d46c7c386d476cfa87f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com>
Date: Fri, 24 May 2024 21:25:29 +0300
Subject: [PATCH] Fix CPU Offloading Usage & Typos (#8230)

* Fix typos

* Fix `pipe.enable_model_cpu_offload()` usage

* Fix cpu offloading

* Update numbers

---
 README.md                                  |  4 +-
 docs/source/en/optimization/tgate.md       | 24 ++++-----
 .../inference_with_tcd_lora.md             | 50 +++++++++----------
 docs/source/en/using-diffusers/inpaint.md  |  8 +--
 examples/community/README.md               | 20 ++++----
 .../train_text_to_image_decoder.py         |  1 -
 tests/lora/test_lora_layers_sd.py          |  2 +-
 tests/pipelines/i2vgen_xl/test_i2vgenxl.py |  1 -
 .../test_ip_adapter_stable_diffusion.py    |  4 +-
 .../test_stable_diffusion_v_pred.py        |  1 -
 .../test_stable_video_diffusion.py         |  1 -
 11 files changed, 56 insertions(+), 60 deletions(-)

diff --git a/README.md b/README.md
index a8a903e2c3..e598c023f4 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi
 
 ## Quickstart
 
-Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 22000+ checkpoints):
+Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 25,000+ checkpoints):
 
 ```python
 from diffusers import DiffusionPipeline
@@ -219,7 +219,7 @@ Also, say 👋 in our public Discord channel
-
+

diff --git a/docs/source/en/optimization/tgate.md b/docs/source/en/optimization/tgate.md
Accelerate `StableDiffusionXLPipeline` with T-GATE:
@@ -78,9 +78,9 @@ pipe = TgateSDXLLoader(
 ).to("cuda")
 
 image = pipe.tgate(
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
-    gate_step=gate_step,
-    num_inference_steps=inference_step
+    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
+    gate_step=gate_step,
+    num_inference_steps=inference_step
 ).images[0]
 ```
@@ -111,9 +111,9 @@ pipe = TgateSDXLDeepCacheLoader(
 ).to("cuda")
 
 image = pipe.tgate(
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
-    gate_step=gate_step,
-    num_inference_steps=inference_step
+    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
+    gate_step=gate_step,
+    num_inference_steps=inference_step
 ).images[0]
 ```
@@ -151,9 +151,9 @@ pipe = TgateSDXLLoader(
 ).to("cuda")
 
 image = pipe.tgate(
-    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
-    gate_step=gate_step,
-    num_inference_steps=inference_step
+    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.",
+    gate_step=gate_step,
+    num_inference_steps=inference_step
 ).images[0]
 ```

diff --git a/docs/source/en/using-diffusers/inference_with_tcd_lora.md b/docs/source/en/using-diffusers/inference_with_tcd_lora.md
index 10ad674e73..df49fc8475 100644
--- a/docs/source/en/using-diffusers/inference_with_tcd_lora.md
+++ b/docs/source/en/using-diffusers/inference_with_tcd_lora.md
@@ -78,7 +78,7 @@ image = pipe(
     prompt=prompt,
     num_inference_steps=4,
     guidance_scale=0,
-    eta=0.3, 
+    eta=0.3,
     generator=torch.Generator(device=device).manual_seed(0),
 ).images[0]
 ```
@@ -156,14 +156,14 @@ image = pipe(
     prompt=prompt,
     num_inference_steps=8,
     guidance_scale=0,
-    eta=0.3, 
+    eta=0.3,
     generator=torch.Generator(device=device).manual_seed(0),
 ).images[0]
 ```
 
 ![](https://github.com/jabir-zheng/TCD/raw/main/assets/animagine_xl.png)
 
-TCD-LoRA also supports other LoRAs trained on different styles. For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and fuse it with the TCD-LoRA with the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method. 
+TCD-LoRA also supports other LoRAs trained on different styles. For example, let's load the [TheLastBen/Papercut_SDXL](https://huggingface.co/TheLastBen/Papercut_SDXL) LoRA and fuse it with the TCD-LoRA using the [`~loaders.UNet2DConditionLoadersMixin.set_adapters`] method.
 
 > [!TIP]
 > Check out the [Merge LoRAs](merge_loras) guide to learn more about efficient merging methods.
 
@@ -171,7 +171,7 @@
 ```python
 import torch
 from diffusers import StableDiffusionXLPipeline
-from scheduling_tcd import TCDScheduler 
+from scheduling_tcd import TCDScheduler
 
 device = "cuda"
 base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
@@ -191,7 +191,7 @@ image = pipe(
     prompt=prompt,
     num_inference_steps=4,
     guidance_scale=0,
-    eta=0.3, 
+    eta=0.3,
     generator=torch.Generator(device=device).manual_seed(0),
 ).images[0]
 ```
@@ -215,7 +215,7 @@ from PIL import Image
 from transformers import DPTFeatureExtractor, DPTForDepthEstimation
 from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
 from diffusers.utils import load_image, make_image_grid
-from scheduling_tcd import TCDScheduler 
+from scheduling_tcd import TCDScheduler
 
 device = "cuda"
 depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to(device)
@@ -249,13 +249,13 @@ controlnet = ControlNetModel.from_pretrained(
     controlnet_id,
     torch_dtype=torch.float16,
     variant="fp16",
-).to(device)
+)
 pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
     base_model_id,
     controlnet=controlnet,
     torch_dtype=torch.float16,
     variant="fp16",
-).to(device)
+)
 pipe.enable_model_cpu_offload()
 pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
 
@@ -271,9 +271,9 @@ depth_image = get_depth_map(image)
 
 controlnet_conditioning_scale = 0.5  # recommended for good generalization
 image = pipe(
-    prompt,
-    image=depth_image,
-    num_inference_steps=4,
+    prompt,
+    image=depth_image,
+    num_inference_steps=4,
     guidance_scale=0,
     eta=0.3,
     controlnet_conditioning_scale=controlnet_conditioning_scale,
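Every `-).to(device)` → `+)` hunk in this file is the same correction: `enable_model_cpu_offload()` must be called while the pipeline still lives on the CPU, because it installs accelerate hooks that move each sub-model to the GPU only for its forward pass and back again afterwards. A minimal sketch of the corrected pattern, outside the patch itself (the base model ID appears in the hunks above; the depth ControlNet checkpoint ID is an assumed example):

```python
import torch
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline

# Build everything on the CPU; no .to("cuda")/.to(device) anywhere.
controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-depth-sdxl-1.0",  # assumed checkpoint, for illustration
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    torch_dtype=torch.float16,
    variant="fp16",
)

# Installs hooks that shuttle each sub-model (text encoders, UNet, ControlNet,
# VAE) onto the GPU only while it runs. Calling .to("cuda") first would leave
# nothing to offload and may conflict with the hooks' device bookkeeping.
pipe.enable_model_cpu_offload()
```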
@@ -290,7 +290,7 @@ grid_image = make_image_grid([depth_image, image], rows=1, cols=2)
 
 import torch
 from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
 from diffusers.utils import load_image, make_image_grid
-from scheduling_tcd import TCDScheduler 
+from scheduling_tcd import TCDScheduler
 
 device = "cuda"
 base_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
@@ -301,13 +301,13 @@ controlnet = ControlNetModel.from_pretrained(
     controlnet_id,
     torch_dtype=torch.float16,
     variant="fp16",
-).to(device)
+)
 pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
     base_model_id,
     controlnet=controlnet,
     torch_dtype=torch.float16,
     variant="fp16",
-).to(device)
+)
 pipe.enable_model_cpu_offload()
 pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
 
@@ -322,9 +322,9 @@ canny_image = load_image("https://huggingface.co/datasets/hf-internal-testing/di
 
 controlnet_conditioning_scale = 0.5  # recommended for good generalization
 image = pipe(
-    prompt,
-    image=canny_image,
-    num_inference_steps=4,
+    prompt,
+    image=canny_image,
+    num_inference_steps=4,
     guidance_scale=0,
     eta=0.3,
     controlnet_conditioning_scale=controlnet_conditioning_scale,
@@ -336,7 +336,7 @@ grid_image = make_image_grid([canny_image, image], rows=1, cols=2)
 
 ![](https://github.com/jabir-zheng/TCD/raw/main/assets/controlnet_canny_tcd.png)
 
-The inference parameters in this example might not work for all examples, so we recommend you to try different values for `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale` and `cross_attention_kwargs` parameters and choose the best one. 
+The inference parameters in this example might not work for all examples, so we recommend trying different values for the `num_inference_steps`, `guidance_scale`, `controlnet_conditioning_scale`, and `cross_attention_kwargs` parameters and choosing the combination that works best.
 
@@ -350,7 +350,7 @@ from diffusers import StableDiffusionXLPipeline
 from diffusers.utils import load_image, make_image_grid
 from ip_adapter import IPAdapterXL
-from scheduling_tcd import TCDScheduler 
+from scheduling_tcd import TCDScheduler
 
 device = "cuda"
 base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
@@ -359,8 +359,8 @@ ip_ckpt = "sdxl_models/ip-adapter_sdxl.bin"
 tcd_lora_id = "h1t/TCD-SDXL-LoRA"
 
 pipe = StableDiffusionXLPipeline.from_pretrained(
-    base_model_path,
-    torch_dtype=torch.float16,
+    base_model_path,
+    torch_dtype=torch.float16,
     variant="fp16"
 )
 pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
@@ -375,13 +375,13 @@ ref_image = load_image("https://raw.githubusercontent.com/tencent-ailab/IP-Adapt
 
 prompt = "best quality, high quality, wearing sunglasses"
 
 image = ip_model.generate(
-    pil_image=ref_image, 
+    pil_image=ref_image,
     prompt=prompt,
     scale=0.5,
-    num_samples=1, 
-    num_inference_steps=4, 
+    num_samples=1,
+    num_inference_steps=4,
     guidance_scale=0,
-    eta=0.3, 
+    eta=0.3,
     seed=0,
 )[0]

diff --git a/docs/source/en/using-diffusers/inpaint.md b/docs/source/en/using-diffusers/inpaint.md
index 193f5a6d9f..ba43325f53 100644
--- a/docs/source/en/using-diffusers/inpaint.md
+++ b/docs/source/en/using-diffusers/inpaint.md
@@ -230,7 +230,7 @@ from diffusers.utils import load_image, make_image_grid
 
 pipeline = AutoPipelineForInpainting.from_pretrained(
     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16"
-).to("cuda")
+)
 pipeline.enable_model_cpu_offload()
 # remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
 pipeline.enable_xformers_memory_efficient_attention()
@@ -255,7 +255,7 @@ from diffusers.utils import load_image, make_image_grid
 
 pipeline = AutoPipelineForInpainting.from_pretrained(
     "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
-).to("cuda")
+)
 pipeline.enable_model_cpu_offload()
 # remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
 pipeline.enable_xformers_memory_efficient_attention()
@@ -296,7 +296,7 @@ from diffusers.utils import load_image, make_image_grid
 
 pipeline = AutoPipelineForInpainting.from_pretrained(
     "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16"
-).to("cuda")
+)
 pipeline.enable_model_cpu_offload()
 # remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
 pipeline.enable_xformers_memory_efficient_attention()
@@ -319,7 +319,7 @@ from diffusers.utils import load_image, make_image_grid
 
 pipeline = AutoPipelineForInpainting.from_pretrained(
     "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
-).to("cuda")
+)
 pipeline.enable_model_cpu_offload()
 # remove following line if xFormers is not installed or you have PyTorch 2.0 or higher installed
 pipeline.enable_xformers_memory_efficient_attention()
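The inpaint.md hunks keep repeating the comment advising to remove `enable_xformers_memory_efficient_attention()` when xFormers is not installed or PyTorch is 2.0 or higher. That advice can also be encoded directly with a guard; a sketch, assuming the `is_xformers_available` helper exposed in `diffusers.utils.import_utils` is acceptable here (the model ID is taken from the hunks above):

```python
import torch
from packaging import version
from diffusers import AutoPipelineForInpainting
from diffusers.utils.import_utils import is_xformers_available

pipeline = AutoPipelineForInpainting.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, variant="fp16"
)
pipeline.enable_model_cpu_offload()

# PyTorch 2.0+ ships fused scaled-dot-product attention, so xFormers only
# helps on older versions, and only if the package is actually installed.
if is_xformers_available() and version.parse(torch.__version__) < version.parse("2.0"):
    pipeline.enable_xformers_memory_efficient_attention()
```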
diff --git a/examples/community/README.md b/examples/community/README.md
index 6d7830621e..600761aae7 100755
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -240,12 +240,12 @@ pipeline_output = pipe(
     # denoising_steps=10,  # (optional) Number of denoising steps of each inference pass. Default: 10.
     # ensemble_size=10,  # (optional) Number of inference passes in the ensemble. Default: 10.
     # ------------------------------------------------
-    
+
     # ----- recommended setting for LCM version ------
     # denoising_steps=4,
     # ensemble_size=5,
     # -------------------------------------------------
-    
+
     # processing_res=768,  # (optional) Maximum resolution of processing. If set to 0: will not resize at all. Defaults to 768.
     # match_input_res=True,  # (optional) Resize depth prediction to match input resolution.
     # batch_size=0,  # (optional) Inference batch size, no bigger than `num_ensemble`. If set to 0, the script will automatically decide the proper batch size. Defaults to 0.
@@ -1032,7 +1032,7 @@ image = pipe().images[0]
 
 Make sure you have @crowsonkb's `k-diffusion` installed:
 
-```
+```sh
 pip install k-diffusion
 ```
@@ -1854,13 +1854,13 @@ To use this pipeline, you need to:
 
 You can simply use pip to install IPEX with the latest version.
 
-```python
+```sh
 python -m pip install intel_extension_for_pytorch
 ```
 
 **Note:** To install a specific version, run with the following command:
 
-```
+```sh
 python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
 ```
@@ -1958,13 +1958,13 @@ To use this pipeline, you need to:
 
 You can simply use pip to install IPEX with the latest version.
 
-```python
+```sh
 python -m pip install intel_extension_for_pytorch
 ```
 
 **Note:** To install a specific version, run with the following command:
 
-```
+```sh
 python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
 ```
@@ -3010,8 +3010,8 @@ This code implements a pipeline for the Stable Diffusion model, enabling the div
 
 ### Sample Code
 
-```
-from from examples.community.regional_prompting_stable_diffusion import RegionalPromptingStableDiffusionPipeline
+```py
+from examples.community.regional_prompting_stable_diffusion import RegionalPromptingStableDiffusionPipeline
 
 pipe = RegionalPromptingStableDiffusionPipeline.from_single_file(model_path, vae=vae)
 
 rp_args = {
@@ -4131,7 +4131,7 @@ This implementation is based on [Diffusers](https://huggingface.co/docs/diffuser
 
 ## Example Usage
 
-```
+```py
 import os
 import torch

diff --git a/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py b/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py
index 78f9b7f18b..409978cb53 100644
--- a/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py
+++ b/examples/kandinsky2_2/text_to_image/train_text_to_image_decoder.py
@@ -896,7 +896,6 @@ def main():
         images = []
         if args.validation_prompts is not None:
             logger.info("Running inference for collecting generated images...")
-            pipeline = pipeline.to(accelerator.device)
             pipeline.torch_dtype = weight_dtype
             pipeline.set_progress_bar_config(disable=True)
             pipeline.enable_model_cpu_offload()
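The LoRA test below switches to `enable_sequential_cpu_offload()`, the more aggressive sibling of `enable_model_cpu_offload()`; both expect a pipeline that has not been moved to the GPU, which is why the `.to(torch_device)` and `.to("cuda")` calls are dropped throughout these tests. A rough comparison as a sketch (model ID and prompt borrowed from elsewhere in this patch; actual savings depend on the model):

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)

# Model offload: whole sub-models (text encoder, UNet, VAE) hop onto the GPU
# one at a time. Small speed penalty, moderate VRAM savings.
pipe.enable_model_cpu_offload()

# Sequential offload (use instead of, not on top of, the call above): weights
# are streamed to the GPU submodule by submodule. Much slower, but peak VRAM
# drops to roughly the size of the largest submodule.
# pipe.enable_sequential_cpu_offload()

image = pipe("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k.").images[0]
```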
diff --git a/tests/lora/test_lora_layers_sd.py b/tests/lora/test_lora_layers_sd.py
index fc28d94c24..46b965ec33 100644
--- a/tests/lora/test_lora_layers_sd.py
+++ b/tests/lora/test_lora_layers_sd.py
@@ -642,7 +642,7 @@ class LoraIntegrationTests(unittest.TestCase):
         This test simply checks that loading a LoRA with an empty network alpha works fine.
         See: https://github.com/huggingface/diffusers/issues/5606
         """
-        pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(torch_device)
+        pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
         pipeline.enable_sequential_cpu_offload()
         civitai_path = hf_hub_download("ybelkada/test-ahi-civitai", "ahi_lora_weights.safetensors")
         pipeline.load_lora_weights(civitai_path, adapter_name="ahri")

diff --git a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py
index 0273e972a6..426e258122 100644
--- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py
+++ b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py
@@ -243,7 +243,6 @@ class I2VGenXLPipelineSlowTests(unittest.TestCase):
     def test_i2vgen_xl(self):
         pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
-        pipe = pipe.to(torch_device)
         pipe.enable_model_cpu_offload()
         pipe.set_progress_bar_config(disable=None)
 
         image = load_image(

diff --git a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py
index 8c95fbc703..bf74b2f060 100644
--- a/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py
+++ b/tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py
@@ -612,10 +612,10 @@ class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin):
     def test_instant_style_multiple_masks(self):
         image_encoder = CLIPVisionModelWithProjection.from_pretrained(
             "h94/IP-Adapter", subfolder="models/image_encoder", torch_dtype=torch.float16
-        ).to("cuda")
+        )
         pipeline = StableDiffusionXLPipeline.from_pretrained(
             "RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.float16, image_encoder=image_encoder, variant="fp16"
-        ).to("cuda")
+        )
         pipeline.enable_model_cpu_offload()
 
         pipeline.load_ip_adapter(

diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
index c3b1b9b854..923fba1272 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py
@@ -420,7 +420,6 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
         pipe.scheduler = DDIMScheduler.from_config(
             pipe.scheduler.config, timestep_spacing="trailing", rescale_betas_zero_snr=True
         )
-        pipe.to(torch_device)
         pipe.enable_model_cpu_offload()
         pipe.set_progress_bar_config(disable=None)

diff --git a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py
index 199ed57bc2..60fc21e202 100644
--- a/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py
+++ b/tests/pipelines/stable_video_diffusion/test_stable_video_diffusion.py
@@ -534,7 +534,6 @@ class StableVideoDiffusionPipelineSlowTests(unittest.TestCase):
             variant="fp16",
             torch_dtype=torch.float16,
         )
-        pipe = pipe.to(torch_device)
         pipe.enable_model_cpu_offload()
         pipe.set_progress_bar_config(disable=None)
 
         image = load_image(