From fe2b39742604e3551e55b9d72c6c75f723100a0a Mon Sep 17 00:00:00 2001
From: Bruno Magalhaes <bruno.magalhaes@synthesia.io>
Date: Wed, 2 Apr 2025 09:19:51 +0200
Subject: [PATCH] remove unnecessary call to `F.pad` (#10620)

* rewrite memory count without implicitly using dimensions by @ic-synth

* replace F.pad with built-in padding in Conv3d

* in-place sums to reduce memory allocations

* fixed trailing whitespace

* file reformatted

* in-place sums

* simpler in-place expressions

* removed in-place sum, may affect backward propagation logic

* removed in-place sum, may affect backward propagation logic

* removed in-place sum, may affect backward propagation logic

* reverted change
---
 .../models/autoencoders/autoencoder_kl_cogvideox.py          | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
index 829e0fe54d..e2b2639689 100644
--- a/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_cogvideox.py
@@ -105,6 +105,7 @@ class CogVideoXCausalConv3d(nn.Module):
         self.width_pad = width_pad
         self.time_pad = time_pad
         self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)
+        self.const_padding_conv3d = (0, self.width_pad, self.height_pad)
 
         self.temporal_dim = 2
         self.time_kernel_size = time_kernel_size
@@ -117,6 +118,8 @@ class CogVideoXCausalConv3d(nn.Module):
             kernel_size=kernel_size,
             stride=stride,
             dilation=dilation,
+            padding=0 if self.pad_mode == "replicate" else self.const_padding_conv3d,
+            padding_mode="zeros",
         )
 
     def fake_context_parallel_forward(
@@ -137,9 +140,7 @@ class CogVideoXCausalConv3d(nn.Module):
         if self.pad_mode == "replicate":
             conv_cache = None
         else:
-            padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad)
             conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone()
-            inputs = F.pad(inputs, padding_2d, mode="constant", value=0)
 
         output = self.conv(inputs)
         return output, conv_cache