From a6f043a80f4951bb65ddb05769723fddb0303a9b Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Fri, 10 Jan 2025 12:50:24 +0530
Subject: [PATCH] [LoRA] allow big CUDA tests to run properly for LoRA (and others) (#9845)

* allow big lora tests to run on the CI.

* print

* print.

* print

* print

* print

* print

* more

* print

* remove print.

* remove print

* directly place on cuda.

* remove pipeline.

* remove

* fix

* fix

* spaces

* quality

* updates

* directly place flux controlnet pipeline on cuda.

* torch_device instead of cuda.

* style

* device placement.

* fixes

* add big gpu marker for mochi; rename test correctly

* address feedback

* fix

---------

Co-authored-by: Aryan
---
 tests/lora/test_lora_layers_flux.py          | 16 ++++++++++------
 tests/lora/test_lora_layers_sd3.py           | 15 ++++++++++-----
 .../controlnet_flux/test_controlnet_flux.py  | 11 +++++------
 tests/pipelines/flux/test_pipeline_flux.py   | 15 ++++++---------
 tests/pipelines/mochi/test_mochi.py          | 10 +++++++---
 5 files changed, 38 insertions(+), 29 deletions(-)

diff --git a/tests/lora/test_lora_layers_flux.py b/tests/lora/test_lora_layers_flux.py
index ace0ad6b60..0a9c4166fe 100644
--- a/tests/lora/test_lora_layers_flux.py
+++ b/tests/lora/test_lora_layers_flux.py
@@ -796,8 +796,8 @@ class FluxControlLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
 @nightly
 @require_torch_gpu
 @require_peft_backend
-@unittest.skip("We cannot run inference on this model with the current CI hardware")
-# TODO (DN6, sayakpaul): move these tests to a beefier GPU
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
 class FluxLoRAIntegrationTests(unittest.TestCase):
     """internal note: The integration slices were obtained on audace.
 
@@ -819,6 +819,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
 
     def tearDown(self):
         super().tearDown()
 
+        del self.pipeline
         gc.collect()
         torch.cuda.empty_cache()
@@ -826,7 +827,10 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         self.pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        self.pipeline.enable_model_cpu_offload()
+        # Instead of calling `enable_model_cpu_offload()`, we do a cuda placement here because the CI
+        # run supports it. We have about 34GB RAM in the CI runner which kills the test when run with
+        # `enable_model_cpu_offload()`. We repeat this for the other tests, too.
+        self.pipeline = self.pipeline.to(torch_device)
 
         prompt = "jon snow eating pizza with ketchup"
 
@@ -848,7 +852,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         self.pipeline.load_lora_weights("Norod78/brain-slug-flux")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        self.pipeline.enable_model_cpu_offload()
+        self.pipeline = self.pipeline.to(torch_device)
 
         prompt = "The cat with a brain slug earring"
         out = self.pipeline(
@@ -870,7 +874,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         self.pipeline.load_lora_weights("cocktailpeanut/optimus", weight_name="optimus.safetensors")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        self.pipeline.enable_model_cpu_offload()
+        self.pipeline = self.pipeline.to(torch_device)
 
         prompt = "optimus is cleaning the house with broomstick"
         out = self.pipeline(
@@ -892,7 +896,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         self.pipeline.load_lora_weights("XLabs-AI/flux-lora-collection", weight_name="disney_lora.safetensors")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        self.pipeline.enable_model_cpu_offload()
+        self.pipeline = self.pipeline.to(torch_device)
 
         prompt = "A blue jay standing on a large basket of rainbow macarons, disney style"
 
diff --git a/tests/lora/test_lora_layers_sd3.py b/tests/lora/test_lora_layers_sd3.py
index 40383e3f1e..448874191d 100644
--- a/tests/lora/test_lora_layers_sd3.py
+++ b/tests/lora/test_lora_layers_sd3.py
@@ -17,6 +17,7 @@ import sys
 import unittest
 
 import numpy as np
+import pytest
 import torch
 from transformers import AutoTokenizer, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
 
@@ -31,9 +32,9 @@ from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
     nightly,
     numpy_cosine_similarity_distance,
+    require_big_gpu_with_torch_cuda,
     require_peft_backend,
     require_torch_gpu,
-    slow,
     torch_device,
 )
 
@@ -128,11 +129,12 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
         pass
 
 
-@slow
 @nightly
 @require_torch_gpu
 @require_peft_backend
-class LoraSD3IntegrationTests(unittest.TestCase):
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
+class SD3LoraIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusion3Img2ImgPipeline
     repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"
 
@@ -166,14 +168,17 @@ class LoraSD3IntegrationTests(unittest.TestCase):
     def test_sd3_img2img_lora(self):
         pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
-        pipe.load_lora_weights("zwloong/sd3-lora-training-rank16-v2", weight_name="pytorch_lora_weights.safetensors")
-        pipe.enable_sequential_cpu_offload()
+        pipe.load_lora_weights("zwloong/sd3-lora-training-rank16-v2")
+        pipe.fuse_lora()
+        pipe.unload_lora_weights()
+        pipe = pipe.to(torch_device)
 
         inputs = self.get_inputs(torch_device)
 
         image = pipe(**inputs).images[0]
         image_slice = image[0, -3:, -3:]
         expected_slice = np.array([0.5396, 0.5776, 0.7432, 0.5151, 0.5586, 0.7383, 0.5537, 0.5933, 0.7153])
+
         max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())
 
         assert max_diff < 1e-4, f"Outputs are not close enough, got {max_diff}"
 
diff --git a/tests/pipelines/controlnet_flux/test_controlnet_flux.py b/tests/pipelines/controlnet_flux/test_controlnet_flux.py
index 8202424e7f..5e856b125f 100644
--- a/tests/pipelines/controlnet_flux/test_controlnet_flux.py
+++ b/tests/pipelines/controlnet_flux/test_controlnet_flux.py
@@ -32,9 +32,9 @@ from diffusers.models import FluxControlNetModel
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
+    nightly,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
-    slow,
     torch_device,
 )
 from diffusers.utils.torch_utils import randn_tensor
 
@@ -204,7 +204,7 @@ class FluxControlNetPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
         assert (output_height, output_width) == (expected_height, expected_width)
 
 
-@slow
+@nightly
 @require_big_gpu_with_torch_cuda
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxControlNetPipelineSlowTests(unittest.TestCase):
@@ -230,8 +230,7 @@ class FluxControlNetPipelineSlowTests(unittest.TestCase):
             text_encoder_2=None,
             controlnet=controlnet,
             torch_dtype=torch.bfloat16,
-        )
-        pipe.enable_model_cpu_offload()
+        ).to(torch_device)
         pipe.set_progress_bar_config(disable=None)
 
         generator = torch.Generator(device="cpu").manual_seed(0)
@@ -241,12 +240,12 @@ class FluxControlNetPipelineSlowTests(unittest.TestCase):
 
         prompt_embeds = torch.load(
             hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/prompt_embeds.pt")
-        )
+        ).to(torch_device)
         pooled_prompt_embeds = torch.load(
             hf_hub_download(
                 repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/pooled_prompt_embeds.pt"
             )
-        )
+        ).to(torch_device)
 
         output = pipe(
             prompt_embeds=prompt_embeds,
diff --git a/tests/pipelines/flux/test_pipeline_flux.py b/tests/pipelines/flux/test_pipeline_flux.py
index 7981e6c2a9..ab36333c40 100644
--- a/tests/pipelines/flux/test_pipeline_flux.py
+++ b/tests/pipelines/flux/test_pipeline_flux.py
@@ -9,6 +9,7 @@ from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPToken
 
 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
 from diffusers.utils.testing_utils import (
+    nightly,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
     slow,
     torch_device,
 )
@@ -209,7 +210,7 @@ class FluxPipelineFastTests(unittest.TestCase, PipelineTesterMixin, FluxIPAdapte
         assert (output_height, output_width) == (expected_height, expected_width)
 
 
-@slow
+@nightly
 @require_big_gpu_with_torch_cuda
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxPipelineSlowTests(unittest.TestCase):
@@ -227,19 +228,16 @@ class FluxPipelineSlowTests(unittest.TestCase):
         torch.cuda.empty_cache()
 
     def get_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device="cpu").manual_seed(seed)
+        generator = torch.Generator(device="cpu").manual_seed(seed)
 
         prompt_embeds = torch.load(
             hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/prompt_embeds.pt")
-        )
+        ).to(torch_device)
         pooled_prompt_embeds = torch.load(
             hf_hub_download(
                 repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/pooled_prompt_embeds.pt"
             )
-        )
+        ).to(torch_device)
         return {
             "prompt_embeds": prompt_embeds,
             "pooled_prompt_embeds": pooled_prompt_embeds,
@@ -253,8 +251,7 @@ class FluxPipelineSlowTests(unittest.TestCase):
     def test_flux_inference(self):
         pipe = self.pipeline_class.from_pretrained(
             self.repo_id, torch_dtype=torch.bfloat16, text_encoder=None, text_encoder_2=None
-        )
-        pipe.enable_model_cpu_offload()
+        ).to(torch_device)
 
         inputs = self.get_inputs(torch_device)
 
diff --git a/tests/pipelines/mochi/test_mochi.py b/tests/pipelines/mochi/test_mochi.py
index bbcf6d210c..c9df578589 100644
--- a/tests/pipelines/mochi/test_mochi.py
+++ b/tests/pipelines/mochi/test_mochi.py
@@ -17,15 +17,17 @@ import inspect
 import unittest
 
 import numpy as np
+import pytest
 import torch
 from transformers import AutoTokenizer, T5EncoderModel
 
 from diffusers import AutoencoderKLMochi, FlowMatchEulerDiscreteScheduler, MochiPipeline, MochiTransformer3DModel
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
+    nightly,
     numpy_cosine_similarity_distance,
+    require_big_gpu_with_torch_cuda,
     require_torch_gpu,
-    slow,
     torch_device,
 )
 
@@ -260,8 +262,10 @@ class MochiPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         )
 
 
-@slow
+@nightly
 @require_torch_gpu
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
 class MochiPipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."
 
@@ -293,7 +297,7 @@ class MochiPipelineIntegrationTests(unittest.TestCase):
         ).frames
 
         video = videos[0]
-        expected_video = torch.randn(1, 16, 480, 848, 3).numpy()
+        expected_video = torch.randn(1, 19, 480, 848, 3).numpy()
 
         max_diff = numpy_cosine_similarity_distance(video, expected_video)
         assert max_diff < 1e-3, f"Max diff is too high. got {video}"
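
For reference, the pattern this patch applies across the touched test files is: mark the integration test class with `nightly`, `require_big_gpu_with_torch_cuda`, and the `big_gpu_with_torch_cuda` pytest marker, then place the pipeline directly on the accelerator with `.to(torch_device)` instead of `enable_model_cpu_offload()`. Below is a minimal sketch of a new test written in that style; the class name, checkpoint id, prompt, and expected slice are illustrative placeholders and are not taken from this patch, while the decorators and helpers come from `diffusers.utils.testing_utils` as imported above.

import gc
import unittest

import numpy as np
import pytest
import torch

from diffusers import DiffusionPipeline
from diffusers.utils.testing_utils import (
    nightly,
    numpy_cosine_similarity_distance,
    require_big_gpu_with_torch_cuda,
    torch_device,
)


@nightly
@require_big_gpu_with_torch_cuda
@pytest.mark.big_gpu_with_torch_cuda  # lets the CI select these tests with `-m big_gpu_with_torch_cuda`
class ExampleBigGPUIntegrationTests(unittest.TestCase):  # hypothetical example class
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_inference(self):
        # Illustrative checkpoint id; a real test would point at the model under test.
        pipe = DiffusionPipeline.from_pretrained("some-org/some-model", torch_dtype=torch.bfloat16)
        # Direct device placement instead of `enable_model_cpu_offload()`, matching the patch.
        pipe = pipe.to(torch_device)

        image = pipe("a prompt", num_inference_steps=2, output_type="np").images[0]
        image_slice = image[0, :3, :3].flatten()

        # Placeholder values; real tests compare against slices recorded on the reference GPU.
        expected_slice = np.array([0.5] * 9, dtype=np.float32)
        max_diff = numpy_cosine_similarity_distance(expected_slice, image_slice)
        assert max_diff < 1e-4, f"Outputs are not close enough, got {max_diff}"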