diff --git a/examples/cogvideo/README.md b/examples/cogvideo/README.md
index a3357b031d..398ae95431 100644
--- a/examples/cogvideo/README.md
+++ b/examples/cogvideo/README.md
@@ -172,9 +172,6 @@ accelerate launch --gpu_ids $GPU_IDS examples/cogvideo/train_cogvideox_lora.py \
   --report_to wandb
 ```
 
-> [!NOTE]
-> At the time of adding support for CogVideoX-LoRA training, the memory required by the training script, with VAE tiling and LoRA rank 64, is ~52 GB (as tested with the simplest `accelerate config` setting) and ~46 GB (as tested with the simplest `accelerate config` DeepSpeed ZeRO-2 training settings).
-
 To better track our training experiments, we're using the following flags in the command above:
 * `--report_to wandb` will ensure the training runs are tracked on Weights and Biases. To use it, be sure to install `wandb` with `pip install wandb`.
 * `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py
index 6f611c8633..a1576be979 100644
--- a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py
+++ b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py
@@ -15,7 +15,7 @@
 
 import inspect
 import math
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import PIL
 import torch
@@ -27,7 +27,10 @@ from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
 from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
-from ...utils import logging, replace_example_docstring
+from ...utils import (
+    logging,
+    replace_example_docstring,
+)
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from .pipeline_output import CogVideoXPipelineOutput
@@ -544,10 +547,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
     def num_timesteps(self):
         return self._num_timesteps
 
-    @property
-    def attention_kwargs(self):
-        return self._attention_kwargs
-
     @property
     def interrupt(self):
         return self._interrupt
@@ -574,7 +573,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         output_type: str = "pil",
         return_dict: bool = True,
-        attention_kwargs: Optional[Dict[str, Any]] = None,
         callback_on_step_end: Optional[
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
@@ -638,10 +636,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             return_dict (`bool`, *optional*, defaults to `True`):
                 Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                 of a plain tuple.
-            attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
-                `self.processor` in
-                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
             callback_on_step_end (`Callable`, *optional*):
                 A function that calls at the end of each denoising steps during the inference. The function is called
                 with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
@@ -687,7 +681,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline):
             negative_prompt_embeds,
         )
         self._guidance_scale = guidance_scale
-        self._attention_kwargs = attention_kwargs
         self._interrupt = False
 
         # 2. Default call parameters
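
After this patch, `CogVideoXImageToVideoPipeline.__call__` no longer accepts an `attention_kwargs` argument. Below is a minimal sketch of how the image-to-video pipeline is invoked post-change; the `THUDM/CogVideoX-5b-I2V` checkpoint and the local file paths are illustrative assumptions, not part of the diff.

```python
import torch
from diffusers import CogVideoXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

# Illustrative checkpoint; any CogVideoX image-to-video checkpoint is called the same way.
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V", torch_dtype=torch.bfloat16
)
pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()

image = load_image("input.png")  # hypothetical input image path
video = pipe(
    image=image,
    prompt="A panda strumming a guitar in a bamboo forest",
    num_inference_steps=50,
    guidance_scale=6.0,
    # Note: attention_kwargs is no longer a valid keyword argument after this change.
).frames[0]

export_to_video(video, "output.mp4", fps=8)
```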