From d06750a5fd19781de68066bb34a3520af83cf124 Mon Sep 17 00:00:00 2001 From: Zijian Zhou Date: Wed, 17 Sep 2025 00:43:15 +0100 Subject: [PATCH] Fix autoencoder_kl_wan.py bugs for Wan2.2 VAE (#12335) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update autoencoder_kl_wan.py When using the Wan2.2 VAE, the spatial compression ratio calculated here is incorrect. It should be 16 instead of 8. Pass it in directly via the config to ensure it’s correct here. * Update autoencoder_kl_wan.py --- src/diffusers/models/autoencoders/autoencoder_kl_wan.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py index d84a0861e9..e6e58c1cce 100644 --- a/src/diffusers/models/autoencoders/autoencoder_kl_wan.py +++ b/src/diffusers/models/autoencoders/autoencoder_kl_wan.py @@ -1052,7 +1052,7 @@ class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin): is_residual=is_residual, ) - self.spatial_compression_ratio = 2 ** len(self.temperal_downsample) + self.spatial_compression_ratio = scale_factor_spatial # When decoding a batch of video latents at a time, one can save memory by slicing across the batch dimension # to perform decoding of a single video latent at a time. @@ -1145,12 +1145,13 @@ class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin): def _encode(self, x: torch.Tensor): _, _, num_frame, height, width = x.shape - if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height): - return self.tiled_encode(x) - self.clear_cache() if self.config.patch_size is not None: x = patchify(x, patch_size=self.config.patch_size) + + if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height): + return self.tiled_encode(x) + iter_ = 1 + (num_frame - 1) // 4 for i in range(iter_): self._enc_conv_idx = [0]