From e5f5b4af30d69a8dac53fc76fa805e46ab6c528f Mon Sep 17 00:00:00 2001
From: Clarence Chen <clarencechenct@gmail.com>
Date: Fri, 10 Mar 2023 15:26:11 -0800
Subject: [PATCH] Add support for sample clipping for numerical stability and
 deprecate old kwarg

---
 .../pipeline_stable_diffusion_pix2pix_zero.py |  4 +--
 .../schedulers/scheduling_ddim_inverse.py     | 32 +++++++++++++------
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
index 8690e9f0b7..e705306139 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py
@@ -1156,8 +1156,8 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline):
 
         # 7. Denoising loop where we obtain the cross-attention maps.
         num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
+        with self.progress_bar(total=num_inference_steps - 1) as progress_bar:
+            for i, t in enumerate(timesteps[:-1]):
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                 latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t)
diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py
index 113afcef55..0e510b3d2c 100644
--- a/src/diffusers/schedulers/scheduling_ddim_inverse.py
+++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py
@@ -96,15 +96,17 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
         trained_betas (`np.ndarray`, optional):
             option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
         clip_sample (`bool`, default `True`):
-            option to clip predicted sample between -1 and 1 for numerical stability.
-        set_alpha_to_one (`bool`, default `True`):
+            option to clip predicted sample for numerical stability.
+        clip_sample_range (`float`, default `1.0`):
+            the maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_zero (`bool`, default `True`):
             each diffusion step uses the value of alphas product at that step and at the previous one. For the final
-            step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
-            otherwise it uses the value of alpha at step 0.
+            step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `0`,
+            otherwise it uses the value of alpha at step `num_train_timesteps - 1`.
         steps_offset (`int`, default `0`):
             an offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
-            stable diffusion.
+            `set_alpha_to_zero=False`, to make the last step use step `num_train_timesteps - 1`
+            for the previous alpha product.
         prediction_type (`str`, default `epsilon`, optional):
             prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
             process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
@@ -125,7 +127,13 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
         set_alpha_to_zero: bool = True,
         steps_offset: int = 0,
         prediction_type: str = "epsilon",
+        clip_sample_range: float = 1.0,
+        **kwargs
     ):
+        if kwargs.get("set_alpha_to_one", None) is not None:
+            deprecation_message = "The `set_alpha_to_one` argument is deprecated. Please use `set_alpha_to_zero` instead."
+            deprecate("set_alpha_to_one", "1.0.0", deprecation_message, standard_warn=False)
+            set_alpha_to_zero = kwargs["set_alpha_to_one"]
         if trained_betas is not None:
             self.betas = torch.tensor(trained_betas, dtype=torch.float32)
         elif beta_schedule == "linear":
@@ -147,7 +155,7 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
         # At every step in inverted ddim, we are looking into the next alphas_cumprod
         # For the final step, there is no next alphas_cumprod, and the index is out of bounds
         # `set_alpha_to_zero` decides whether we set this parameter simply to zero
-        # in this case, self.step() just normalizes output by self.config.prediction_type
+        # in this case, self.step() just output the predicted noise
         # or whether we use the final alpha of the "non-previous" one.
         self.final_alpha_cumprod = torch.tensor(0.0) if set_alpha_to_zero else self.alphas_cumprod[-1]
 
@@ -237,10 +245,16 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
                 " `v_prediction`"
             )
 
-        # 4. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        # 4. Clip or threshold "predicted x_0"
+        if self.config.clip_sample:
+            pred_original_sample = pred_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+
+        # 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
         pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * pred_epsilon
 
-        # 5. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        # 6. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
         prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
 
         if not return_dict: