From 62a151d8f46cf9cb66f7932b9d9fb31bef70d90a Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Sun, 25 Jun 2023 21:26:31 +0000
Subject: [PATCH] proof that it works in run local xl

---
 run_local_xl.py                              | 56 +++++++++++++++++++
 .../pipeline_stable_diffusion_upscale.py     |  3 +-
 .../pipeline_stable_diffusion_xl.py          | 26 +++++++--
 3 files changed, 80 insertions(+), 5 deletions(-)
 create mode 100755 run_local_xl.py

diff --git a/run_local_xl.py b/run_local_xl.py
new file mode 100755
index 0000000000..db41d2cdf9
--- /dev/null
+++ b/run_local_xl.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+from diffusers import DiffusionPipeline, EulerDiscreteScheduler, StableDiffusionPipeline, KDPM2DiscreteScheduler, StableDiffusionImg2ImgPipeline, HeunDiscreteScheduler, KDPM2AncestralDiscreteScheduler, DDIMScheduler
+import time
+import os
+from huggingface_hub import HfApi
+# from compel import Compel
+import torch
+import sys
+from pathlib import Path
+import requests
+from PIL import Image
+from io import BytesIO
+
+path = sys.argv[1]
+
+api = HfApi()
+start_time = time.time()
+pipe = DiffusionPipeline.from_pretrained(path, torch_dtype=torch.float16)
+pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+# pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+# pipe = StableDiffusionImg2ImgPipeline.from_pretrained(path, torch_dtype=torch.float16, safety_checker=None)
+
+# compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
+
+
+pipe = pipe.to("cuda")
+
+prompt = "An astronaut riding a green horse on Mars"
+
+# prompts = ["a cat playing with a ball++ in the forest", "a cat playing with a ball++ in the forest", "a cat playing with a ball-- in the forest"]
+
+# prompt_embeds = torch.cat([compel.build_conditioning_tensor(prompt) for prompt in prompts])
+
+# generator = [torch.Generator(device="cuda").manual_seed(0) for _ in range(prompt_embeds.shape[0])]
+#
+# url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+#
+# response = requests.get(url)
+# image = Image.open(BytesIO(response.content)).convert("RGB")
+# image.thumbnail((768, 768))
+#
+
+# pipe.unet.set_default_attn_processor()
+image = pipe(prompt=prompt).images[0]
+
+file_name = "aaa"
+path = os.path.join(Path.home(), "images", f"{file_name}.png")
+image.save(path)
+
+api.upload_file(
+    path_or_fileobj=path,
+    path_in_repo=path.split("/")[-1],
+    repo_id="patrickvonplaten/images",
+    repo_type="dataset",
+)
+print(f"https://huggingface.co/datasets/patrickvonplaten/images/blob/main/{file_name}.png")
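
The new run_local_xl.py above is a quick smoke test carrying several commented-out leftovers (Compel weighting, img2img, manual generators). Reduced to the lines that actually execute, it amounts to the following sketch; the checkpoint path still comes from the command line, and the output filename here is a placeholder rather than the upload path used above:

    #!/usr/bin/env python3
    # Minimal sketch of the SDXL smoke test; the output filename is a placeholder.
    import sys

    import torch
    from diffusers import DiffusionPipeline, EulerDiscreteScheduler

    # Load the checkpoint passed on the command line in half precision.
    pipe = DiffusionPipeline.from_pretrained(sys.argv[1], torch_dtype=torch.float16)
    # Swap in the Euler scheduler, reusing the checkpoint's scheduler config.
    pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
    pipe = pipe.to("cuda")

    image = pipe(prompt="An astronaut riding a green horse on Mars").images[0]
    image.save("astronaut_sdxl.png")
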
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
index 0fda05ea5e..06b6628bd3 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -24,7 +24,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
 from ...image_processor import VaeImageProcessor
 from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...models.attention_processor import AttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor
+from ...models.attention_processor import AttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor, LoRAAttnProcessor2_0
 from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers
 from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor
 from ..pipeline_utils import DiffusionPipeline
@@ -747,6 +747,7 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMixin
             AttnProcessor2_0,
             XFormersAttnProcessor,
             LoRAXFormersAttnProcessor,
+            LoRAAttnProcessor2_0,
         ]
         # if xformers or torch_2_0 is used attention block does not need
         # to be in float32 which can save lots of memory
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
index 85a8959f69..14e674ca86 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -25,6 +25,7 @@ from ...image_processor import VaeImageProcessor
 from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...schedulers import KarrasDiffusionSchedulers
+from ...models.attention_processor import AttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor, LoRAAttnProcessor2_0
 from ...utils import (
     deprecate,
     is_accelerate_available,
@@ -648,8 +649,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline):
         )
         prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds = self.encode_prompt(
             prompt,
-            "cpu",
-            # device,
+            device,
             num_images_per_prompt,
             do_classifier_free_guidance,
             negative_prompt,
@@ -727,10 +727,28 @@ class StableDiffusionXLPipeline(DiffusionPipeline):
                 if callback is not None and i % callback_steps == 0:
                     callback(i, t, latents)
 
+        # make sure the VAE is in float32 mode, as it overflows in float16
+        self.vae.to(dtype=torch.float32)
+
+        use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
+            AttnProcessor2_0,
+            XFormersAttnProcessor,
+            LoRAXFormersAttnProcessor,
+            LoRAAttnProcessor2_0,
+        ]
+        # if xformers or torch_2_0 is used attention block does not need
+        # to be in float32 which can save lots of memory
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(latents.dtype)
+            self.vae.decoder.conv_in.to(latents.dtype)
+            self.vae.decoder.mid_block.to(latents.dtype)
+        else:
+            latents = latents.float()
+
+
         if not output_type == "latent":
             # CHECK there is problem here (PVP)
-            with torch.autocast("cuda", enabled=False):
-                image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
             #image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
             has_nsfw_concept = None
         else:
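
The StableDiffusionXLPipeline change is the substantive one: the SDXL VAE overflows in float16, so the decoder is upcast to float32 before decoding, and when the attention implementation tolerates half precision (PyTorch 2.0 SDPA or xFormers) the input path of the decoder is cast back to float16 to save memory. A standalone sketch of the same idea, assuming a diffusers AutoencoderKL `vae` and float16 `latents`; the helper name is made up for illustration:

    import torch
    from diffusers.models.attention_processor import (
        AttnProcessor2_0,
        LoRAAttnProcessor2_0,
        LoRAXFormersAttnProcessor,
        XFormersAttnProcessor,
    )

    def decode_with_upcast_vae(vae, latents):
        # Hypothetical helper: the SDXL VAE overflows in float16, so run the
        # decoder in float32 by default.
        vae.to(dtype=torch.float32)
        use_torch_2_0_or_xformers = isinstance(
            vae.decoder.mid_block.attentions[0].processor,
            (AttnProcessor2_0, XFormersAttnProcessor, LoRAXFormersAttnProcessor, LoRAAttnProcessor2_0),
        )
        if use_torch_2_0_or_xformers:
            # SDPA/xFormers attention does not overflow in float16, so the
            # decoder's input path can stay in half precision to save memory.
            vae.post_quant_conv.to(latents.dtype)
            vae.decoder.conv_in.to(latents.dtype)
            vae.decoder.mid_block.to(latents.dtype)
        else:
            # Vanilla attention needs float32 activations to avoid overflow.
            latents = latents.float()
        return vae.decode(latents / vae.config.scaling_factor, return_dict=False)[0]

The sketch deviates from the hunk above in one spot: it uses isinstance() instead of the `processor in [...]` membership test, because an attention-processor instance never compares equal to a processor class, so the membership test always evaluates to False and the all-float32 fallback path would always be taken.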