diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py
index 61487cde15..fbe0d22a52 100644
--- a/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/before_denoise.py
@@ -1697,11 +1697,6 @@ class StableDiffusionXLControlNetUnionInputStep(ModularPipelineBlocks):
             InputParam("controlnet_conditioning_scale", default=1.0),
             InputParam("guess_mode", default=False),
             InputParam("num_images_per_prompt", default=1),
-        ]
-
-    @property
-    def intermediate_inputs(self) -> List[InputParam]:
-        return [
             InputParam(
                 "latents",
                 required=True,
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/decoders.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/decoders.py
index 38fa3b5c51..feb78e1ef1 100644
--- a/src/diffusers/modular_pipelines/stable_diffusion_xl/decoders.py
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/decoders.py
@@ -179,11 +179,6 @@ class StableDiffusionXLInpaintOverlayMaskStep(ModularPipelineBlocks):
             InputParam("image"),
             InputParam("mask_image"),
             InputParam("padding_mask_crop"),
-        ]
-
-    @property
-    def intermediate_inputs(self) -> List[str]:
-        return [
             InputParam(
                 "images",
                 type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.array]],
diff --git a/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py b/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py
index 8926d6c1fb..1e8921d363 100644
--- a/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py
+++ b/src/diffusers/modular_pipelines/stable_diffusion_xl/encoders.py
@@ -663,12 +663,11 @@ class StableDiffusionXLVaeEncoderStep(ModularPipelineBlocks):
         block_state.device = components._execution_device
         block_state.dtype = block_state.dtype if block_state.dtype is not None else components.vae.dtype
 
-        block_state.image = components.image_processor.preprocess(
+        image = components.image_processor.preprocess(
             block_state.image, height=block_state.height, width=block_state.width, **block_state.preprocess_kwargs
         )
-        block_state.image = block_state.image.to(device=block_state.device, dtype=block_state.dtype)
-
-        block_state.batch_size = block_state.image.shape[0]
+        image = image.to(device=block_state.device, dtype=block_state.dtype)
+        block_state.batch_size = image.shape[0]
 
         # if generator is a list, make sure the length of it matches the length of images (both should be batch_size)
         if isinstance(block_state.generator, list) and len(block_state.generator) != block_state.batch_size:
@@ -677,9 +676,7 @@ class StableDiffusionXLVaeEncoderStep(ModularPipelineBlocks):
                 f" size of {block_state.batch_size}. Make sure the batch size matches the length of the generators."
             )
 
-        block_state.image_latents = self._encode_vae_image(
-            components, image=block_state.image, generator=block_state.generator
-        )
+        block_state.image_latents = self._encode_vae_image(components, image=image, generator=block_state.generator)
 
         self.set_block_state(state, block_state)
 
@@ -850,34 +847,32 @@ class StableDiffusionXLInpaintVaeEncoderStep(ModularPipelineBlocks):
             block_state.crops_coords = None
             block_state.resize_mode = "default"
 
-        block_state.image = components.image_processor.preprocess(
+        image = components.image_processor.preprocess(
             block_state.image,
             height=block_state.height,
             width=block_state.width,
             crops_coords=block_state.crops_coords,
             resize_mode=block_state.resize_mode,
         )
-        block_state.image = block_state.image.to(dtype=torch.float32)
+        image = image.to(dtype=torch.float32)
 
-        block_state.mask = components.mask_processor.preprocess(
+        mask = components.mask_processor.preprocess(
             block_state.mask_image,
             height=block_state.height,
             width=block_state.width,
             resize_mode=block_state.resize_mode,
             crops_coords=block_state.crops_coords,
         )
-        block_state.masked_image = block_state.image * (block_state.mask < 0.5)
+        block_state.masked_image = image * (mask < 0.5)
 
-        block_state.batch_size = block_state.image.shape[0]
-        block_state.image = block_state.image.to(device=block_state.device, dtype=block_state.dtype)
-        block_state.image_latents = self._encode_vae_image(
-            components, image=block_state.image, generator=block_state.generator
-        )
+        block_state.batch_size = image.shape[0]
+        image = image.to(device=block_state.device, dtype=block_state.dtype)
+        block_state.image_latents = self._encode_vae_image(components, image=image, generator=block_state.generator)
 
         # 7. Prepare mask latent variables
         block_state.mask, block_state.masked_image_latents = self.prepare_mask_latents(
             components,
-            block_state.mask,
+            mask,
             block_state.masked_image,
             block_state.batch_size,
             block_state.height,