From 5893fdcbfc794e8b7ad25a86ce4e26f3ee84fdfd Mon Sep 17 00:00:00 2001 From: Aryan Date: Tue, 17 Sep 2024 22:46:34 +0200 Subject: [PATCH] update --- .../pipelines/cogvideo/pipeline_cogvideox.py | 27 +----------------- .../pipeline_cogvideox_image2video.py | 28 +------------------ .../pipeline_cogvideox_video2video.py | 25 +---------------- 3 files changed, 3 insertions(+), 77 deletions(-) diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py index 4428137f15..02497e77ed 100644 --- a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py +++ b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox.py @@ -26,13 +26,7 @@ from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel from ...models.embeddings import get_3d_rotary_pos_embed from ...pipelines.pipeline_utils import DiffusionPipeline from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler -from ...utils import ( - USE_PEFT_BACKEND, - logging, - replace_example_docstring, - scale_lora_layers, - unscale_lora_layers, -) +from ...utils import logging, replace_example_docstring from ...utils.torch_utils import randn_tensor from ...video_processor import VideoProcessor from .pipeline_output import CogVideoXPipelineOutput @@ -250,7 +244,6 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin): max_sequence_length: int = 226, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, - lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -277,20 +270,9 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin): torch device dtype: (`torch.dtype`, *optional*): torch dtype - lora_scale (`float`, *optional*): - A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ device = device or self._execution_device - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, CogVideoXLoraLoaderMixin): - self._lora_scale = lora_scale - - # dynamically adjust the LoRA scale - if self.text_encoder is not None and USE_PEFT_BACKEND: - scale_lora_layers(self.text_encoder, lora_scale) - prompt = [prompt] if isinstance(prompt, str) else prompt if prompt is not None: batch_size = len(prompt) @@ -330,11 +312,6 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin): dtype=dtype, ) - if self.text_encoder is not None: - if isinstance(self, CogVideoXLoraLoaderMixin) and USE_PEFT_BACKEND: - # Retrieve the original scale by scaling back the LoRA layers - unscale_lora_layers(self.text_encoder, lora_scale) - return prompt_embeds, negative_prompt_embeds def prepare_latents( @@ -644,7 +621,6 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin): do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt - lora_scale = self.attention_kwargs.get("scale", None) if self.attention_kwargs is not None else None prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, negative_prompt, @@ -654,7 +630,6 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin): negative_prompt_embeds=negative_prompt_embeds, max_sequence_length=max_sequence_length, device=device, - lora_scale=lora_scale, ) if do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py index 9726944ee0..6f611c8633 100644 --- a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py +++ b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_image2video.py @@ -23,18 +23,11 @@ from transformers import T5EncoderModel, T5Tokenizer from ...callbacks import MultiPipelineCallbacks, PipelineCallback from ...image_processor import PipelineImageInput -from ...loaders import CogVideoXLoraLoaderMixin from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel from ...models.embeddings import get_3d_rotary_pos_embed from ...pipelines.pipeline_utils import DiffusionPipeline from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler -from ...utils import ( - USE_PEFT_BACKEND, - logging, - replace_example_docstring, - scale_lora_layers, - unscale_lora_layers, -) +from ...utils import logging, replace_example_docstring from ...utils.torch_utils import randn_tensor from ...video_processor import VideoProcessor from .pipeline_output import CogVideoXPipelineOutput @@ -269,7 +262,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline): max_sequence_length: int = 226, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, - lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -296,20 +288,9 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline): torch device dtype: (`torch.dtype`, *optional*): torch dtype - lora_scale (`float`, *optional*): - A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ device = device or self._execution_device - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, CogVideoXLoraLoaderMixin): - self._lora_scale = lora_scale - - # dynamically adjust the LoRA scale - if self.text_encoder is not None and USE_PEFT_BACKEND: - scale_lora_layers(self.text_encoder, lora_scale) - prompt = [prompt] if isinstance(prompt, str) else prompt if prompt is not None: batch_size = len(prompt) @@ -349,11 +330,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline): dtype=dtype, ) - if self.text_encoder is not None: - if isinstance(self, CogVideoXLoraLoaderMixin) and USE_PEFT_BACKEND: - # Retrieve the original scale by scaling back the LoRA layers - unscale_lora_layers(self.text_encoder, lora_scale) - return prompt_embeds, negative_prompt_embeds def prepare_latents( @@ -730,7 +706,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline): do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt - lora_scale = self.attention_kwargs.get("scale", None) if self.attention_kwargs is not None else None prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt=prompt, negative_prompt=negative_prompt, @@ -740,7 +715,6 @@ class CogVideoXImageToVideoPipeline(DiffusionPipeline): negative_prompt_embeds=negative_prompt_embeds, max_sequence_length=max_sequence_length, device=device, - lora_scale=lora_scale, ) if do_classifier_free_guidance: prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) diff --git a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py index 7e4310cae8..92d5eeeef8 100644 --- a/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py +++ b/src/diffusers/pipelines/cogvideo/pipeline_cogvideox_video2video.py @@ -27,13 +27,7 @@ from ...models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel from ...models.embeddings import get_3d_rotary_pos_embed from ...pipelines.pipeline_utils import DiffusionPipeline from ...schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler -from ...utils import ( - USE_PEFT_BACKEND, - logging, - replace_example_docstring, - scale_lora_layers, - unscale_lora_layers, -) +from ...utils import logging, replace_example_docstring from ...utils.torch_utils import randn_tensor from ...video_processor import VideoProcessor from .pipeline_output import CogVideoXPipelineOutput @@ -274,7 +268,6 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin) max_sequence_length: int = 226, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, - lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -301,20 +294,9 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin) torch device dtype: (`torch.dtype`, *optional*): torch dtype - lora_scale (`float`, *optional*): - A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ device = device or self._execution_device - # set lora scale so that monkey patched LoRA - # function of text encoder can correctly access it - if lora_scale is not None and isinstance(self, CogVideoXLoraLoaderMixin): - self._lora_scale = lora_scale - - # dynamically adjust the LoRA scale - if self.text_encoder is not None and USE_PEFT_BACKEND: - scale_lora_layers(self.text_encoder, lora_scale) - prompt = [prompt] if isinstance(prompt, str) else prompt if prompt is not None: batch_size = len(prompt) @@ -354,11 +336,6 @@ class CogVideoXVideoToVideoPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin) dtype=dtype, ) - if self.text_encoder is not None: - if isinstance(self, CogVideoXLoraLoaderMixin) and USE_PEFT_BACKEND: - # Retrieve the original scale by scaling back the LoRA layers - unscale_lora_layers(self.text_encoder, lora_scale) - return prompt_embeds, negative_prompt_embeds def prepare_latents(