mirror of https://github.com/huggingface/diffusers.git
synced 2026-01-27 17:22:53 +03:00
[LoRA] allow big CUDA tests to run properly for LoRA (and others) (#9845)
* allow big lora tests to run on the CI.
* print
* print.
* print
* print
* print
* print
* more
* print
* remove print.
* remove print
* directly place on cuda.
* remove pipeline.
* remove
* fix
* fix
* spaces
* quality
* updates
* directly place flux controlnet pipeline on cuda.
* torch_device instead of cuda.
* style
* device placement.
* fixes
* add big gpu marker for mochi; rename test correctly
* address feedback
* fix

---------

Co-authored-by: Aryan <aryan@huggingface.co>
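The gating pattern this commit applies to the big integration tests, as a minimal sketch. The decorators, the `big_gpu_with_torch_cuda` pytest mark, and `torch_device` come from the hunks below; the class name, checkpoint, and prompt are illustrative assumptions only, not part of the commit:

```python
import unittest

import pytest
import torch

from diffusers import FluxPipeline
from diffusers.utils.testing_utils import nightly, require_big_gpu_with_torch_cuda, torch_device


@nightly
@require_big_gpu_with_torch_cuda      # skip unless the runner has a big CUDA GPU
@pytest.mark.big_gpu_with_torch_cuda  # lets the CI select these tests by mark
class ExampleBigGPUIntegrationTests(unittest.TestCase):
    def test_inference(self):
        # Place the whole pipeline on the accelerator instead of calling
        # enable_model_cpu_offload(); the big-GPU runner has the VRAM for it.
        pipe = FluxPipeline.from_pretrained(
            "black-forest-labs/FLUX.1-dev",  # illustrative checkpoint, not from this commit
            torch_dtype=torch.bfloat16,
        ).to(torch_device)
        image = pipe("a photo of a cat", num_inference_steps=2, output_type="np").images[0]
        assert image.shape[-1] == 3
```

Tests marked this way can then be selected on the big-GPU runner with pytest's standard mark filter, e.g. `pytest -m big_gpu_with_torch_cuda`; the exact CI invocation is not part of this commit.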
@@ -796,8 +796,8 @@ class FluxControlLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
 @nightly
-@require_torch_gpu
 @require_peft_backend
-@unittest.skip("We cannot run inference on this model with the current CI hardware")
-# TODO (DN6, sayakpaul): move these tests to a beefier GPU
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
 class FluxLoRAIntegrationTests(unittest.TestCase):
     """internal note: The integration slices were obtained on audace.
@@ -819,6 +819,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
     def tearDown(self):
         super().tearDown()

+        del self.pipeline
         gc.collect()
         torch.cuda.empty_cache()
@@ -826,7 +827,10 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         self.pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        self.pipeline.enable_model_cpu_offload()
+        # Instead of calling `enable_model_cpu_offload()`, we do a cuda placement here because the CI
+        # run supports it. We have about 34GB RAM in the CI runner which kills the test when run with
+        # `enable_model_cpu_offload()`. We repeat this for the other tests, too.
+        self.pipeline = self.pipeline.to(torch_device)

         prompt = "jon snow eating pizza with ketchup"
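For context, the two placement strategies the comment above contrasts, sketched outside the test suite (the checkpoint name is an assumption used only for illustration):

```python
import torch

from diffusers import FluxPipeline

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)

# What the tests used before: components live in system RAM and are moved to the
# GPU one model at a time, so peak VRAM stays low but host memory has to hold the
# whole pipeline -- too much for the ~34GB of RAM on the CI runner.
# pipe.enable_model_cpu_offload()

# What the tests do now: keep everything on the big GPU and skip offloading.
pipe = pipe.to("cuda")
```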
@@ -848,7 +852,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         self.pipeline.load_lora_weights("Norod78/brain-slug-flux")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        self.pipeline.enable_model_cpu_offload()
+        self.pipeline = self.pipeline.to(torch_device)

         prompt = "The cat with a brain slug earring"
         out = self.pipeline(
@@ -870,7 +874,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         self.pipeline.load_lora_weights("cocktailpeanut/optimus", weight_name="optimus.safetensors")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        self.pipeline.enable_model_cpu_offload()
+        self.pipeline = self.pipeline.to(torch_device)

         prompt = "optimus is cleaning the house with broomstick"
         out = self.pipeline(
@@ -892,7 +896,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
         self.pipeline.load_lora_weights("XLabs-AI/flux-lora-collection", weight_name="disney_lora.safetensors")
         self.pipeline.fuse_lora()
         self.pipeline.unload_lora_weights()
-        self.pipeline.enable_model_cpu_offload()
+        self.pipeline = self.pipeline.to(torch_device)

         prompt = "A blue jay standing on a large basket of rainbow macarons, disney style"
@@ -17,6 +17,7 @@ import sys
 import unittest

 import numpy as np
+import pytest
 import torch
 from transformers import AutoTokenizer, CLIPTextModelWithProjection, CLIPTokenizer, T5EncoderModel
@@ -31,9 +32,9 @@ from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
     nightly,
     numpy_cosine_similarity_distance,
+    require_big_gpu_with_torch_cuda,
     require_peft_backend,
-    require_torch_gpu,
     slow,
     torch_device,
 )
@@ -128,11 +129,12 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
     pass


 @slow
 @nightly
-@require_torch_gpu
 @require_peft_backend
-class LoraSD3IntegrationTests(unittest.TestCase):
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
+class SD3LoraIntegrationTests(unittest.TestCase):
     pipeline_class = StableDiffusion3Img2ImgPipeline
     repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"
@@ -166,14 +168,17 @@ class LoraSD3IntegrationTests(unittest.TestCase):

     def test_sd3_img2img_lora(self):
         pipe = self.pipeline_class.from_pretrained(self.repo_id, torch_dtype=torch.float16)
-        pipe.load_lora_weights("zwloong/sd3-lora-training-rank16-v2", weight_name="pytorch_lora_weights.safetensors")
-        pipe.enable_sequential_cpu_offload()
+        pipe.load_lora_weights("zwloong/sd3-lora-training-rank16-v2")
+        pipe.fuse_lora()
+        pipe.unload_lora_weights()
+        pipe = pipe.to(torch_device)

         inputs = self.get_inputs(torch_device)

         image = pipe(**inputs).images[0]
         image_slice = image[0, -3:, -3:]
         expected_slice = np.array([0.5396, 0.5776, 0.7432, 0.5151, 0.5586, 0.7383, 0.5537, 0.5933, 0.7153])

         max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())

         assert max_diff < 1e-4, f"Outputs are not close enough, got {max_diff}"
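The LoRA handling in the rewritten test above follows a fuse-then-unload pattern; roughly, using the same checkpoints the hunk uses (the "cuda" target stands in for `torch_device`):

```python
import torch

from diffusers import StableDiffusion3Img2ImgPipeline

pipe = StableDiffusion3Img2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
)
pipe.load_lora_weights("zwloong/sd3-lora-training-rank16-v2")
pipe.fuse_lora()            # bake the LoRA deltas into the base weights
pipe.unload_lora_weights()  # drop the adapter layers; the fused weights remain
pipe = pipe.to("cuda")      # direct placement instead of CPU offloading
```

Fusing before unloading means inference runs on plain fused weights, so no adapter bookkeeping is left in the forward pass.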
@@ -32,9 +32,9 @@ from diffusers.models import FluxControlNetModel
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
+    nightly,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
-    slow,
     torch_device,
 )
 from diffusers.utils.torch_utils import randn_tensor
@@ -204,7 +204,7 @@ class FluxControlNetPipelineFastTests(unittest.TestCase, PipelineTesterMixin):
         assert (output_height, output_width) == (expected_height, expected_width)


-@slow
+@nightly
 @require_big_gpu_with_torch_cuda
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxControlNetPipelineSlowTests(unittest.TestCase):
@@ -230,8 +230,7 @@ class FluxControlNetPipelineSlowTests(unittest.TestCase):
             text_encoder_2=None,
             controlnet=controlnet,
             torch_dtype=torch.bfloat16,
-        )
-        pipe.enable_model_cpu_offload()
+        ).to(torch_device)
         pipe.set_progress_bar_config(disable=None)

         generator = torch.Generator(device="cpu").manual_seed(0)
@@ -241,12 +240,12 @@ class FluxControlNetPipelineSlowTests(unittest.TestCase):

         prompt_embeds = torch.load(
             hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/prompt_embeds.pt")
-        )
+        ).to(torch_device)
         pooled_prompt_embeds = torch.load(
             hf_hub_download(
                 repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/pooled_prompt_embeds.pt"
             )
-        )
+        ).to(torch_device)

         output = pipe(
             prompt_embeds=prompt_embeds,
@@ -9,6 +9,7 @@ from transformers import AutoTokenizer, CLIPTextConfig, CLIPTextModel, CLIPToken

 from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
 from diffusers.utils.testing_utils import (
+    nightly,
     numpy_cosine_similarity_distance,
     require_big_gpu_with_torch_cuda,
     slow,
@@ -209,7 +210,7 @@ class FluxPipelineFastTests(unittest.TestCase, PipelineTesterMixin, FluxIPAdapte
         assert (output_height, output_width) == (expected_height, expected_width)


-@slow
+@nightly
 @require_big_gpu_with_torch_cuda
 @pytest.mark.big_gpu_with_torch_cuda
 class FluxPipelineSlowTests(unittest.TestCase):
@@ -227,19 +228,16 @@ class FluxPipelineSlowTests(unittest.TestCase):
         torch.cuda.empty_cache()

     def get_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device="cpu").manual_seed(seed)
+        generator = torch.Generator(device="cpu").manual_seed(seed)

         prompt_embeds = torch.load(
             hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/prompt_embeds.pt")
-        )
+        ).to(torch_device)
         pooled_prompt_embeds = torch.load(
             hf_hub_download(
                 repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/pooled_prompt_embeds.pt"
             )
-        )
+        ).to(torch_device)
         return {
             "prompt_embeds": prompt_embeds,
             "pooled_prompt_embeds": pooled_prompt_embeds,
@@ -253,8 +251,7 @@ class FluxPipelineSlowTests(unittest.TestCase):
     def test_flux_inference(self):
         pipe = self.pipeline_class.from_pretrained(
             self.repo_id, torch_dtype=torch.bfloat16, text_encoder=None, text_encoder_2=None
-        )
-        pipe.enable_model_cpu_offload()
+        ).to(torch_device)

         inputs = self.get_inputs(torch_device)
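The `get_inputs()` and `test_flux_inference` hunks above rely on precomputed prompt embeddings so the text encoders can be dropped from the pipeline. Roughly, under the same dataset repo and filenames the hunks use (the checkpoint and the "cuda" target are illustrative assumptions):

```python
import torch
from huggingface_hub import hf_hub_download

from diffusers import FluxPipeline

# Precomputed T5/CLIP prompt embeddings published in the diffusers/test-slices dataset.
prompt_embeds = torch.load(
    hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/prompt_embeds.pt")
).to("cuda")
pooled_prompt_embeds = torch.load(
    hf_hub_download(repo_id="diffusers/test-slices", repo_type="dataset", filename="flux/pooled_prompt_embeds.pt")
).to("cuda")

# With embeddings supplied directly, the text encoders never need to be loaded.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",  # illustrative checkpoint, not shown in the hunks
    text_encoder=None,
    text_encoder_2=None,
    torch_dtype=torch.bfloat16,
).to("cuda")

image = pipe(
    prompt_embeds=prompt_embeds,
    pooled_prompt_embeds=pooled_prompt_embeds,
    num_inference_steps=2,
    output_type="np",
).images[0]
```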
@@ -17,15 +17,17 @@ import inspect
 import unittest

 import numpy as np
+import pytest
 import torch
 from transformers import AutoTokenizer, T5EncoderModel

 from diffusers import AutoencoderKLMochi, FlowMatchEulerDiscreteScheduler, MochiPipeline, MochiTransformer3DModel
 from diffusers.utils.testing_utils import (
     enable_full_determinism,
+    nightly,
     numpy_cosine_similarity_distance,
+    require_big_gpu_with_torch_cuda,
     require_torch_gpu,
-    slow,
     torch_device,
 )
@@ -260,8 +262,10 @@ class MochiPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
         )


-@slow
+@nightly
 @require_torch_gpu
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
 class MochiPipelineIntegrationTests(unittest.TestCase):
     prompt = "A painting of a squirrel eating a burger."
@@ -293,7 +297,7 @@ class MochiPipelineIntegrationTests(unittest.TestCase):
         ).frames

         video = videos[0]
-        expected_video = torch.randn(1, 16, 480, 848, 3).numpy()
+        expected_video = torch.randn(1, 19, 480, 848, 3).numpy()

         max_diff = numpy_cosine_similarity_distance(video, expected_video)
         assert max_diff < 1e-3, f"Max diff is too high. got {video}"