From 2fb9baf934629db718f00d4eaeeddd9dac80dfdd Mon Sep 17 00:00:00 2001
From: Dhruv Nair <dhruv.nair@gmail.com>
Date: Fri, 19 Jan 2024 07:24:19 +0000
Subject: [PATCH] update

---
 src/diffusers/loaders/autoencoder.py          | 222 ------------------
 src/diffusers/loaders/controlnet.py           | 167 -------------
 src/diffusers/loaders/single_file.py          |   3 +-
 src/diffusers/loaders/single_file_utils.py    |  34 +--
 .../models/autoencoders/autoencoder_kl.py     |   4 +-
 5 files changed, 16 insertions(+), 414 deletions(-)
 delete mode 100644 src/diffusers/loaders/autoencoder.py
 delete mode 100644 src/diffusers/loaders/controlnet.py

diff --git a/src/diffusers/loaders/autoencoder.py b/src/diffusers/loaders/autoencoder.py
deleted file mode 100644
index 94240c0f4b..0000000000
--- a/src/diffusers/loaders/autoencoder.py
+++ /dev/null
@@ -1,222 +0,0 @@
-from contextlib import nullcontext
-from io import BytesIO
-from pathlib import Path
-
-import requests
-import torch
-from huggingface_hub import hf_hub_download
-from huggingface_hub.utils import validate_hf_hub_args
-
-from ..utils import (
-    is_accelerate_available,
-    is_transformers_available,
-    logging,
-)
-from ..utils.import_utils import BACKENDS_MAPPING
-
-
-if is_transformers_available():
-    pass
-
-if is_accelerate_available():
-    from accelerate import init_empty_weights
-
-logger = logging.get_logger(__name__)
-
-
-class FromOriginalVAEMixin:
-    """
-    Load pretrained ControlNet weights saved in the `.ckpt` or `.safetensors` format into an [`AutoencoderKL`].
-    """
-
-    @classmethod
-    @validate_hf_hub_args
-    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
-        r"""
-        Instantiate a [`AutoencoderKL`] from pretrained ControlNet weights saved in the original `.ckpt` or
-        `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default.
-
-        Parameters:
-            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
-                Can be either:
-                    - A link to the `.ckpt` file (for example
-                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
-                    - A path to a *file* containing all pipeline weights.
-            torch_dtype (`str` or `torch.dtype`, *optional*):
-                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
-                dtype is automatically derived from the model's weights.
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            cache_dir (`Union[str, os.PathLike]`, *optional*):
-                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
-                is not used.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-                incompletely downloaded files are deleted.
-            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            local_files_only (`bool`, *optional*, defaults to `False`):
-                Whether to only load local model weights and configuration files or not. If set to True, the model
-                won't be downloaded from the Hub.
-            token (`str` or *bool*, *optional*):
-                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
-                `diffusers-cli login` (stored in `~/.huggingface`) is used.
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
-                allowed by Git.
-            image_size (`int`, *optional*, defaults to 512):
-                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
-                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
-            use_safetensors (`bool`, *optional*, defaults to `None`):
-                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
-                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
-                weights. If set to `False`, safetensors weights are not loaded.
-            upcast_attention (`bool`, *optional*, defaults to `None`):
-                Whether the attention computation should always be upcasted.
-            scaling_factor (`float`, *optional*, defaults to 0.18215):
-                The component-wise standard deviation of the trained latent space computed using the first batch of the
-                training set. This is used to scale the latent space to have unit variance when training the diffusion
-                model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
-                diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z
-                = 1 / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution
-                Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
-            kwargs (remaining dictionary of keyword arguments, *optional*):
-                Can be used to overwrite load and saveable variables (for example the pipeline components of the
-                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
-                method. See example below for more information.
-
-        <Tip warning={true}>
-
-            Make sure to pass both `image_size` and `scaling_factor` to `from_single_file()` if you're loading
-            a VAE from SDXL or a Stable Diffusion v2 model or higher.
-
-        </Tip>
-
-        Examples:
-
-        ```py
-        from diffusers import AutoencoderKL
-
-        url = "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors"  # can also be local file
-        model = AutoencoderKL.from_single_file(url)
-        ```
-        """
-        if not is_omegaconf_available():
-            raise ValueError(BACKENDS_MAPPING["omegaconf"][1])
-
-        from omegaconf import OmegaConf
-
-        from ..models import AutoencoderKL
-
-        # import here to avoid circular dependency
-        from ..pipelines.stable_diffusion.convert_from_ckpt import (
-            convert_ldm_vae_checkpoint,
-            create_vae_diffusers_config,
-        )
-
-        config_file = kwargs.pop("config_file", None)
-        cache_dir = kwargs.pop("cache_dir", None)
-        resume_download = kwargs.pop("resume_download", False)
-        force_download = kwargs.pop("force_download", False)
-        proxies = kwargs.pop("proxies", None)
-        local_files_only = kwargs.pop("local_files_only", None)
-        token = kwargs.pop("token", None)
-        revision = kwargs.pop("revision", None)
-        image_size = kwargs.pop("image_size", None)
-        scaling_factor = kwargs.pop("scaling_factor", None)
-        kwargs.pop("upcast_attention", None)
-
-        torch_dtype = kwargs.pop("torch_dtype", None)
-
-        use_safetensors = kwargs.pop("use_safetensors", None)
-
-        file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
-        from_safetensors = file_extension == "safetensors"
-
-        if from_safetensors and use_safetensors is False:
-            raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
-
-        # remove huggingface url
-        for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]:
-            if pretrained_model_link_or_path.startswith(prefix):
-                pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
-
-        # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
-        ckpt_path = Path(pretrained_model_link_or_path)
-        if not ckpt_path.is_file():
-            # get repo_id and (potentially nested) file path of ckpt in repo
-            repo_id = "/".join(ckpt_path.parts[:2])
-            file_path = "/".join(ckpt_path.parts[2:])
-
-            if file_path.startswith("blob/"):
-                file_path = file_path[len("blob/") :]
-
-            if file_path.startswith("main/"):
-                file_path = file_path[len("main/") :]
-
-            pretrained_model_link_or_path = hf_hub_download(
-                repo_id,
-                filename=file_path,
-                cache_dir=cache_dir,
-                resume_download=resume_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                token=token,
-                revision=revision,
-                force_download=force_download,
-            )
-
-        if from_safetensors:
-            from safetensors import safe_open
-
-            checkpoint = {}
-            with safe_open(pretrained_model_link_or_path, framework="pt", device="cpu") as f:
-                for key in f.keys():
-                    checkpoint[key] = f.get_tensor(key)
-        else:
-            checkpoint = torch.load(pretrained_model_link_or_path, map_location="cpu")
-
-        if "state_dict" in checkpoint:
-            checkpoint = checkpoint["state_dict"]
-
-        if config_file is None:
-            config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"
-            config_file = BytesIO(requests.get(config_url).content)
-
-        original_config = OmegaConf.load(config_file)
-
-        # default to sd-v1-5
-        image_size = image_size or 512
-
-        vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
-        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)
-
-        if scaling_factor is None:
-            if (
-                "model" in original_config
-                and "params" in original_config.model
-                and "scale_factor" in original_config.model.params
-            ):
-                vae_scaling_factor = original_config.model.params.scale_factor
-            else:
-                vae_scaling_factor = 0.18215  # default SD scaling factor
-
-        vae_config["scaling_factor"] = vae_scaling_factor
-
-        ctx = init_empty_weights if is_accelerate_available() else nullcontext
-        with ctx():
-            vae = AutoencoderKL(**vae_config)
-
-        if is_accelerate_available():
-            from ..models.modeling_utils import load_model_dict_into_meta
-
-            load_model_dict_into_meta(vae, converted_vae_checkpoint, device="cpu")
-        else:
-            vae.load_state_dict(converted_vae_checkpoint)
-
-        if torch_dtype is not None:
-            vae.to(dtype=torch_dtype)
-
-        return vae
diff --git a/src/diffusers/loaders/controlnet.py b/src/diffusers/loaders/controlnet.py
deleted file mode 100644
index 4f709d75be..0000000000
--- a/src/diffusers/loaders/controlnet.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from io import BytesIO
-from pathlib import Path
-
-import requests
-from huggingface_hub import hf_hub_download
-from huggingface_hub.utils import validate_hf_hub_args
-
-
-class FromOriginalControlnetMixin:
-    """
-    Load pretrained ControlNet weights saved in the `.ckpt` or `.safetensors` format into a [`ControlNetModel`].
-    """
-
-    @classmethod
-    @validate_hf_hub_args
-    def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
-        r"""
-        Instantiate a [`ControlNetModel`] from pretrained ControlNet weights saved in the original `.ckpt` or
-        `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default.
-
-        Parameters:
-            pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*):
-                Can be either:
-                    - A link to the `.ckpt` file (for example
-                      `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
-                    - A path to a *file* containing all pipeline weights.
-            torch_dtype (`str` or `torch.dtype`, *optional*):
-                Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
-                dtype is automatically derived from the model's weights.
-            force_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
-                cached versions if they exist.
-            cache_dir (`Union[str, os.PathLike]`, *optional*):
-                Path to a directory where a downloaded pretrained model configuration is cached if the standard cache
-                is not used.
-            resume_download (`bool`, *optional*, defaults to `False`):
-                Whether or not to resume downloading the model weights and configuration files. If set to `False`, any
-                incompletely downloaded files are deleted.
-            proxies (`Dict[str, str]`, *optional*):
-                A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            local_files_only (`bool`, *optional*, defaults to `False`):
-                Whether to only load local model weights and configuration files or not. If set to True, the model
-                won't be downloaded from the Hub.
-            token (`str` or *bool*, *optional*):
-                The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from
-                `diffusers-cli login` (stored in `~/.huggingface`) is used.
-            revision (`str`, *optional*, defaults to `"main"`):
-                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
-                allowed by Git.
-            use_safetensors (`bool`, *optional*, defaults to `None`):
-                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
-                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
-                weights. If set to `False`, safetensors weights are not loaded.
-            image_size (`int`, *optional*, defaults to 512):
-                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
-                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
-            upcast_attention (`bool`, *optional*, defaults to `None`):
-                Whether the attention computation should always be upcasted.
-            kwargs (remaining dictionary of keyword arguments, *optional*):
-                Can be used to overwrite load and saveable variables (for example the pipeline components of the
-                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
-                method. See example below for more information.
-
-        Examples:
-
-        ```py
-        from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
-
-        url = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"  # can also be a local path
-        model = ControlNetModel.from_single_file(url)
-
-        url = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors"  # can also be a local path
-        pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=controlnet)
-        ```
-        """
-        # import here to avoid circular dependency
-        from ..pipelines.stable_diffusion.convert_from_ckpt import download_controlnet_from_original_ckpt
-
-        config_file = kwargs.pop("config_file", None)
-        cache_dir = kwargs.pop("cache_dir", None)
-        resume_download = kwargs.pop("resume_download", False)
-        force_download = kwargs.pop("force_download", False)
-        proxies = kwargs.pop("proxies", None)
-        local_files_only = kwargs.pop("local_files_only", None)
-        token = kwargs.pop("token", None)
-        num_in_channels = kwargs.pop("num_in_channels", None)
-        use_linear_projection = kwargs.pop("use_linear_projection", None)
-        revision = kwargs.pop("revision", None)
-        extract_ema = kwargs.pop("extract_ema", False)
-        image_size = kwargs.pop("image_size", None)
-        upcast_attention = kwargs.pop("upcast_attention", None)
-
-        torch_dtype = kwargs.pop("torch_dtype", None)
-
-        use_safetensors = kwargs.pop("use_safetensors", None)
-
-        file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
-        from_safetensors = file_extension == "safetensors"
-
-        if from_safetensors and use_safetensors is False:
-            raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
-
-        # remove huggingface url
-        for prefix in ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"]:
-            if pretrained_model_link_or_path.startswith(prefix):
-                pretrained_model_link_or_path = pretrained_model_link_or_path[len(prefix) :]
-
-        # Code based on diffusers.pipelines.pipeline_utils.DiffusionPipeline.from_pretrained
-        ckpt_path = Path(pretrained_model_link_or_path)
-        if not ckpt_path.is_file():
-            # get repo_id and (potentially nested) file path of ckpt in repo
-            repo_id = "/".join(ckpt_path.parts[:2])
-            file_path = "/".join(ckpt_path.parts[2:])
-
-            if file_path.startswith("blob/"):
-                file_path = file_path[len("blob/") :]
-
-            if file_path.startswith("main/"):
-                file_path = file_path[len("main/") :]
-
-            pretrained_model_link_or_path = hf_hub_download(
-                repo_id,
-                filename=file_path,
-                cache_dir=cache_dir,
-                resume_download=resume_download,
-                proxies=proxies,
-                local_files_only=local_files_only,
-                token=token,
-                revision=revision,
-                force_download=force_download,
-            )
-
-        if config_file is None:
-            config_url = "https://raw.githubusercontent.com/lllyasviel/ControlNet/main/models/cldm_v15.yaml"
-            config_file = BytesIO(requests.get(config_url).content)
-
-        image_size = image_size or 512
-
-        controlnet = download_controlnet_from_original_ckpt(
-            pretrained_model_link_or_path,
-            original_config_file=config_file,
-            image_size=image_size,
-            extract_ema=extract_ema,
-            num_in_channels=num_in_channels,
-            upcast_attention=upcast_attention,
-            from_safetensors=from_safetensors,
-            use_linear_projection=use_linear_projection,
-        )
-
-        if torch_dtype is not None:
-            controlnet.to(dtype=torch_dtype)
-
-        return controlnet
diff --git a/src/diffusers/loaders/single_file.py b/src/diffusers/loaders/single_file.py
index 1896440b6c..d07b58bac5 100644
--- a/src/diffusers/loaders/single_file.py
+++ b/src/diffusers/loaders/single_file.py
@@ -231,7 +231,6 @@ class FromSingleFileMixin:
         ```
         """
         original_config_file = kwargs.pop("original_config_file", None)
-        config_files = kwargs.pop("config_files", None)
         resume_download = kwargs.pop("resume_download", False)
         force_download = kwargs.pop("force_download", False)
         proxies = kwargs.pop("proxies", None)
@@ -270,7 +269,7 @@ class FromSingleFileMixin:
         while "state_dict" in checkpoint:
             checkpoint = checkpoint["state_dict"]
 
-        original_config = fetch_original_config(class_name, checkpoint, original_config_file, config_files)
+        original_config = fetch_original_config(class_name, checkpoint, original_config_file)
 
         if class_name == "AutoencoderKL":
             image_size = kwargs.pop("image_size", None)
diff --git a/src/diffusers/loaders/single_file_utils.py b/src/diffusers/loaders/single_file_utils.py
index a35ec5b390..3ab0aa716f 100644
--- a/src/diffusers/loaders/single_file_utils.py
+++ b/src/diffusers/loaders/single_file_utils.py
@@ -14,6 +14,8 @@
 # limitations under the License.
 """ Conversion script for the Stable Diffusion checkpoints."""
 
+import os
+import re
 from contextlib import nullcontext
 from io import BytesIO
 
@@ -188,7 +190,7 @@ SD_2_TEXT_ENCODER_KEYS_TO_IGNORE = [
 ]
 
 
-def fetch_original_config_file_from_url(class_name, checkpoint):
+def infer_original_config_file(class_name, checkpoint):
     if CHECKPOINT_KEY_NAMES["v2"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024:
         config_url = CONFIG_URLS["v2"]
 
@@ -212,30 +214,20 @@ def fetch_original_config_file_from_url(class_name, checkpoint):
     return original_config_file
 
 
-def fetch_original_config_file_from_file(config_files: list):
-    if "v2" in config_files:
-        return config_files["v2"]
+def fetch_original_config(pipeline_class_name, checkpoint, original_config_file=None):
 
-    elif "xl" in config_files:
-        return config_files["xl"]
+    def is_valid_url(url):
+        pattern = r'^(http|https):\/\/([\w.-]+)(\.[\w.-]+)+([\/\w\.-]*)*\/?$'
+        return bool(re.match(pattern, url))
 
-    elif "xl_refiner" in config_files:
-        return config_files["xl_refiner"]
+    if os.path.isfile(original_config_file):
+        with open(original_config_file, "r") as fp:
+            original_config_file = fp.read()
 
+    elif is_valid_url(original_config_file):
+        original_config_file = BytesIO(requests.get(original_config_file).content)
     else:
-        return config_files["v1"]
-
-
-def fetch_original_config(pipeline_class_name, checkpoint, original_config_file=None, config_files=None):
-    if original_config_file:
-        original_config = yaml.safe_load(original_config_file)
-        return original_config
-
-    elif config_files:
-        original_config_file = fetch_original_config_file_from_file(config_files)
-
-    else:
-        original_config_file = fetch_original_config_file_from_url(pipeline_class_name, checkpoint)
+        original_config_file = infer_original_config_file(pipeline_class_name, checkpoint)
 
     original_config = yaml.safe_load(original_config_file)
 
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl.py b/src/diffusers/models/autoencoders/autoencoder_kl.py
index 10a3ae58de..92d12a220f 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl.py
@@ -17,7 +17,7 @@ import torch
 import torch.nn as nn
 
 from ...configuration_utils import ConfigMixin, register_to_config
-from ...loaders import FromOriginalVAEMixin
+from ...loaders import FromSingleFileMixin
 from ...utils.accelerate_utils import apply_forward_hook
 from ..attention_processor import (
     ADDED_KV_ATTENTION_PROCESSORS,
@@ -32,7 +32,7 @@ from ..modeling_utils import ModelMixin
 from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
 
 
-class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
+class AutoencoderKL(ModelMixin, ConfigMixin, FromSingleFileMixin):
     r"""
     A VAE model with KL loss for encoding images into latents and decoding latent representations into images.