mirror of https://github.com/huggingface/diffusers.git synced 2026-01-29 07:22:12 +03:00

Add support for sample clipping for numerical stability and deprecate old kwarg

Clarence Chen
2023-03-10 15:26:11 -08:00
parent 5228f55a14
commit e5f5b4af30
2 changed files with 25 additions and 11 deletions


@@ -1156,8 +1156,8 @@ class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline):
         # 7. Denoising loop where we obtain the cross-attention maps.
         num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
+        with self.progress_bar(total=num_inference_steps - 1) as progress_bar:
+            for i, t in enumerate(timesteps[:-1]):
                 # expand the latents if we are doing classifier free guidance
                 latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                 latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t)
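The inversion loop now runs over all but the last timestep, so the progress bar total drops by one; presumably the final inverted step is skipped because, with the scheduler change below, it would only return the predicted noise. A minimal sketch of the new loop shape, assuming a diffusers version that includes this commit (the loop body, which expands the latents, runs the UNet, and calls the inverse scheduler, is elided):

    from diffusers import DDIMInverseScheduler

    inverse_scheduler = DDIMInverseScheduler(num_train_timesteps=1000)
    inverse_scheduler.set_timesteps(num_inference_steps=50)
    timesteps = inverse_scheduler.timesteps

    # one fewer iteration than num_inference_steps, matching progress_bar(total=num_inference_steps - 1)
    for i, t in enumerate(timesteps[:-1]):
        pass  # expand latents, predict noise, and call inverse_scheduler.step(...) here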


@@ -96,15 +96,17 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
         trained_betas (`np.ndarray`, optional):
             option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
         clip_sample (`bool`, default `True`):
-            option to clip predicted sample between -1 and 1 for numerical stability.
-        set_alpha_to_one (`bool`, default `True`):
+            option to clip predicted sample for numerical stability.
+        clip_sample_range (`float`, default `1.0`):
+            the maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
+        set_alpha_to_zero (`bool`, default `True`):
             each diffusion step uses the value of alphas product at that step and at the previous one. For the final
-            step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
-            otherwise it uses the value of alpha at step 0.
+            step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `0`,
+            otherwise it uses the value of alpha at step `num_train_timesteps - 1`.
         steps_offset (`int`, default `0`):
             an offset added to the inference steps. You can use a combination of `offset=1` and
-            `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
-            stable diffusion.
+            `set_alpha_to_zero=False`, to make the last step use step `num_train_timesteps - 1`
+            for the previous alpha product.
         prediction_type (`str`, default `epsilon`, optional):
             prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
             process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4
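As a usage illustration of the options documented above, a minimal construction sketch (the values are arbitrary examples, and the exact signature depends on the installed diffusers version):

    from diffusers import DDIMInverseScheduler

    # clip_sample bounds the predicted x_0 to [-clip_sample_range, clip_sample_range]
    inverse_scheduler = DDIMInverseScheduler(
        num_train_timesteps=1000,
        beta_schedule="scaled_linear",
        clip_sample=True,
        clip_sample_range=1.0,
        set_alpha_to_zero=True,
    )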
@@ -125,7 +127,13 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
         set_alpha_to_zero: bool = True,
         steps_offset: int = 0,
         prediction_type: str = "epsilon",
+        clip_sample_range: float = 1.0,
+        **kwargs
     ):
+        if kwargs.get("set_alpha_to_one", None) is not None:
+            deprecation_message = "The `set_alpha_to_one` argument is deprecated. Please use `set_alpha_to_zero` instead."
+            deprecate("set_alpha_to_one", "1.0.0", deprecation_message, standard_warn=False)
+            set_alpha_to_zero = kwargs["set_alpha_to_one"]
         if trained_betas is not None:
             self.betas = torch.tensor(trained_betas, dtype=torch.float32)
         elif beta_schedule == "linear":
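The deprecated kwarg is still accepted through `**kwargs`: passing `set_alpha_to_one` emits a deprecation message via `diffusers.utils.deprecate` and its value is forwarded to `set_alpha_to_zero`. A small sketch of what a caller sees, assuming a diffusers version that includes this commit:

    from diffusers import DDIMInverseScheduler

    # warns that set_alpha_to_one is deprecated, then behaves like set_alpha_to_zero=False
    scheduler = DDIMInverseScheduler(set_alpha_to_one=False)
    print(scheduler.final_alpha_cumprod)  # last entry of alphas_cumprod rather than 0.0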
@@ -147,7 +155,7 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
         # At every step in inverted ddim, we are looking into the next alphas_cumprod
         # For the final step, there is no next alphas_cumprod, and the index is out of bounds
         # `set_alpha_to_zero` decides whether we set this parameter simply to zero
-        # in this case, self.step() just normalizes output by self.config.prediction_type
+        # in this case, self.step() just output the predicted noise
         # or whether we use the final alpha of the "non-previous" one.
         self.final_alpha_cumprod = torch.tensor(0.0) if set_alpha_to_zero else self.alphas_cumprod[-1]
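To see why the comment says the last inverted step just outputs the predicted noise, substitute alpha_prod_t_prev = 0 (the value of final_alpha_cumprod when `set_alpha_to_zero=True`) into the update used in step(). A self-contained sketch with stand-in tensors (the names mirror the scheduler code; the tensors themselves are hypothetical):

    import torch

    pred_original_sample = torch.randn(1, 4, 64, 64)  # stand-in for the predicted x_0
    pred_epsilon = torch.randn(1, 4, 64, 64)          # stand-in for the predicted noise

    alpha_prod_t_prev = 0.0  # final_alpha_cumprod when set_alpha_to_zero=True
    pred_sample_direction = (1 - alpha_prod_t_prev) ** 0.5 * pred_epsilon
    prev_sample = alpha_prod_t_prev ** 0.5 * pred_original_sample + pred_sample_direction
    assert torch.equal(prev_sample, pred_epsilon)  # the step reduces to the predicted noise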
@@ -237,10 +245,16 @@ class DDIMInverseScheduler(SchedulerMixin, ConfigMixin):
                 " `v_prediction`"
             )
-        # 4. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        # 4. Clip or threshold "predicted x_0"
+        if self.config.clip_sample:
+            pred_original_sample = pred_original_sample.clamp(
+                -self.config.clip_sample_range, self.config.clip_sample_range
+            )
+        # 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
         pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * pred_epsilon
-        # 5. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        # 6. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
         prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
         if not return_dict:
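The new clipping branch is a plain clamp on the predicted x_0. An isolated sketch with made-up config values and a dummy tensor, not the scheduler's actual step() method:

    import torch

    clip_sample = True       # stands in for self.config.clip_sample
    clip_sample_range = 1.0  # stands in for self.config.clip_sample_range

    pred_original_sample = 3.0 * torch.randn(2, 4, 64, 64)  # hypothetical predicted x_0 with outliers
    if clip_sample:
        pred_original_sample = pred_original_sample.clamp(-clip_sample_range, clip_sample_range)
    assert pred_original_sample.abs().max() <= clip_sample_range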