Fix [core/GLIGEN]: TypeError when iterating over 0-d tensor with In-painting mode when EulerAncestralDiscreteScheduler is used (#5305)

* fix(gligen_inpaint_pipeline): 🐛 Wrap the timestep() 0-d tensor in a list to convert to 1-d tensor. This avoids the TypeError caused by trying to directly iterate over a 0-dimensional tensor in the denoising stage * test(gligen/gligen_text_image): unit test using the EulerAncestralDiscreteScheduler --------- Co-authored-by: zhen-hao.chu <zhen-hao.chu@vitrox.com> Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>
2026-01-27 17:22:53 +03:00 · 2023-10-09 15:54:01 +08:00
parent 0513a8cfd8
commit 6bd55b54bc
5 changed files with 47 additions and 5 deletions
--- a/examples/custom_diffusion/train_custom_diffusion.py
+++ b/examples/custom_diffusion/train_custom_diffusion.py
@@ -207,7 +207,7 @@ class CustomDiffusionDataset(Dataset):
                    with open(concept["class_prompt"], "r") as f:
                        class_prompt = f.read().splitlines()

-                class_img_path = [(x, y) for (x, y) in zip(class_images_path, class_prompt)]
+                class_img_path = list(zip(class_images_path, class_prompt))
                self.class_images_path.extend(class_img_path[:num_class_images])

        random.shuffle(self.instance_images_path)
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen.py
@@ -803,7 +803,9 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline):

                if gligen_inpaint_image is not None:
                    gligen_inpaint_latent_with_noise = (
-                        self.scheduler.add_noise(gligen_inpaint_latent, torch.randn_like(gligen_inpaint_latent), t)
+                        self.scheduler.add_noise(
+                            gligen_inpaint_latent, torch.randn_like(gligen_inpaint_latent), torch.tensor([t])
+                        )
                        .expand(latents.shape[0], -1, -1, -1)
                        .clone()
                    )
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_gligen_text_image.py
@@ -965,7 +965,9 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline):

                if gligen_inpaint_image is not None:
                    gligen_inpaint_latent_with_noise = (
-                        self.scheduler.add_noise(gligen_inpaint_latent, torch.randn_like(gligen_inpaint_latent), t)
+                        self.scheduler.add_noise(
+                            gligen_inpaint_latent, torch.randn_like(gligen_inpaint_latent), torch.tensor([t])
+                        )
                        .expand(latents.shape[0], -1, -1, -1)
                        .clone()
                    )
--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_gligen.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_gligen.py
@@ -22,6 +22,7 @@ from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
 from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
+    EulerAncestralDiscreteScheduler,
    StableDiffusionGLIGENPipeline,
    UNet2DConditionModel,
 )
@@ -120,7 +121,7 @@ class GligenPipelineFastTests(
        }
        return inputs

-    def test_gligen(self):
+    def test_stable_diffusion_gligen_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionGLIGENPipeline(**components)
@@ -136,6 +137,24 @@ class GligenPipelineFastTests(

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

+    def test_stable_diffusion_gligen_k_euler_ancestral(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionGLIGENPipeline(**components)
+        sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        output = sd_pipe(**inputs)
+        image = output.images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+        expected_slice = np.array([0.425, 0.494, 0.429, 0.469, 0.525, 0.417, 0.533, 0.5, 0.47])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
    def test_attention_slicing_forward_pass(self):
        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)

--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_gligen_text_image.py
+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_gligen_text_image.py
@@ -29,6 +29,7 @@ from transformers import (
 from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
+    EulerAncestralDiscreteScheduler,
    StableDiffusionGLIGENTextImagePipeline,
    UNet2DConditionModel,
 )
@@ -150,7 +151,7 @@ class GligenTextImagePipelineFastTests(
        }
        return inputs

-    def test_gligen(self):
+    def test_stable_diffusion_gligen_text_image_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionGLIGENTextImagePipeline(**components)
@@ -166,6 +167,24 @@ class GligenTextImagePipelineFastTests(

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

+    def test_stable_diffusion_gligen_k_euler_ancestral(self):
+        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
+        components = self.get_dummy_components()
+        sd_pipe = StableDiffusionGLIGENTextImagePipeline(**components)
+        sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+        sd_pipe = sd_pipe.to(device)
+        sd_pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        image = sd_pipe(**inputs).images
+        image_slice = image[0, -3:, -3:, -1]
+
+        assert image.shape == (1, 64, 64, 3)
+
+        expected_slice = np.array([0.425, 0.494, 0.429, 0.469, 0.525, 0.417, 0.533, 0.5, 0.47])
+
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
    def test_attention_slicing_forward_pass(self):
        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)