From 829545df04f3004eae0a90d909185881aead71e8 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@huggingface.co>
Date: Tue, 10 Jun 2025 23:31:09 +0200
Subject: [PATCH] rename Cosmos text2image pipeline to Cosmos2TextToImagePipeline

---
 scripts/convert_cosmos_to_diffusers.py                 | 10 +++++++---
 src/diffusers/__init__.py                              |  4 ++--
 src/diffusers/pipelines/__init__.py                    |  4 ++--
 src/diffusers/pipelines/cosmos/__init__.py             |  4 ++--
 ...os_text2image.py => pipeline_cosmos2_text2image.py} | 10 +++++++---
 .../utils/dummy_torch_and_transformers_objects.py      |  2 +-
 6 files changed, 21 insertions(+), 13 deletions(-)
 rename src/diffusers/pipelines/cosmos/{pipeline_cosmos_text2image.py => pipeline_cosmos2_text2image.py} (98%)

diff --git a/scripts/convert_cosmos_to_diffusers.py b/scripts/convert_cosmos_to_diffusers.py
index 8797f17d75..0a96e99da0 100644
--- a/scripts/convert_cosmos_to_diffusers.py
+++ b/scripts/convert_cosmos_to_diffusers.py
@@ -10,9 +10,11 @@ from transformers import T5EncoderModel, T5TokenizerFast
 from diffusers import (
     AutoencoderKLCosmos,
     AutoencoderKLWan,
-    CosmosTextToImagePipeline,
+    Cosmos2TextToImagePipeline,
+    Cosmos2VideoToWorldPipeline,
     CosmosTextToWorldPipeline,
     CosmosTransformer3DModel,
+    CosmosVideoToWorldPipeline,
     EDMEulerScheduler,
 )
 
@@ -412,7 +414,8 @@ def save_pipeline_cosmos_1_0(args, transformer, vae):
         final_sigmas_type="sigma_min",
     )
 
-    pipe = CosmosTextToWorldPipeline(
+    pipe_cls = CosmosTextToWorldPipeline if "Text2World" in args.transformer_type else CosmosVideoToWorldPipeline
+    pipe = pipe_cls(
         text_encoder=text_encoder,
         tokenizer=tokenizer,
         transformer=transformer,
@@ -438,7 +441,8 @@ def save_pipeline_cosmos_2_0(args, transformer, vae):
         use_flow_sigmas=True,
     )
 
-    pipe = CosmosTextToImagePipeline(
+    pipe_cls = Cosmos2TextToImagePipeline if "Text2Image" in args.transformer_type else Cosmos2VideoToWorldPipeline
+    pipe = pipe_cls(
         text_encoder=text_encoder,
         tokenizer=tokenizer,
         transformer=transformer,
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 7fb33f76e5..393af8ad2b 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -361,8 +361,8 @@ else:
             "CogView4ControlPipeline",
             "CogView4Pipeline",
             "ConsisIDPipeline",
+            "Cosmos2TextToImagePipeline",
             "Cosmos2VideoToWorldPipeline",
-            "CosmosTextToImagePipeline",
             "CosmosTextToWorldPipeline",
             "CosmosVideoToWorldPipeline",
             "CycleDiffusionPipeline",
@@ -951,8 +951,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             CogView4ControlPipeline,
             CogView4Pipeline,
             ConsisIDPipeline,
+            Cosmos2TextToImagePipeline,
             Cosmos2VideoToWorldPipeline,
-            CosmosTextToImagePipeline,
             CosmosTextToWorldPipeline,
             CosmosVideoToWorldPipeline,
             CycleDiffusionPipeline,
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 574c150213..47aba3a14e 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -158,7 +158,7 @@ else:
     _import_structure["cogview4"] = ["CogView4Pipeline", "CogView4ControlPipeline"]
     _import_structure["consisid"] = ["ConsisIDPipeline"]
     _import_structure["cosmos"] = [
-        "CosmosTextToImagePipeline",
+        "Cosmos2TextToImagePipeline",
         "CosmosTextToWorldPipeline",
         "CosmosVideoToWorldPipeline",
         "Cosmos2VideoToWorldPipeline",
@@ -565,8 +565,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             StableDiffusionXLControlNetXSPipeline,
         )
         from .cosmos import (
+            Cosmos2TextToImagePipeline,
             Cosmos2VideoToWorldPipeline,
-            CosmosTextToImagePipeline,
             CosmosTextToWorldPipeline,
             CosmosVideoToWorldPipeline,
         )
diff --git a/src/diffusers/pipelines/cosmos/__init__.py b/src/diffusers/pipelines/cosmos/__init__.py
index 88de165801..2833c89abd 100644
--- a/src/diffusers/pipelines/cosmos/__init__.py
+++ b/src/diffusers/pipelines/cosmos/__init__.py
@@ -22,8 +22,8 @@ except OptionalDependencyNotAvailable:
 
     _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
+    _import_structure["pipeline_cosmos2_text2image"] = ["Cosmos2TextToImagePipeline"]
     _import_structure["pipeline_cosmos2_video2world"] = ["Cosmos2VideoToWorldPipeline"]
-    _import_structure["pipeline_cosmos_text2image"] = ["CosmosTextToImagePipeline"]
     _import_structure["pipeline_cosmos_text2world"] = ["CosmosTextToWorldPipeline"]
     _import_structure["pipeline_cosmos_video2world"] = ["CosmosVideoToWorldPipeline"]
 
@@ -35,8 +35,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
     except OptionalDependencyNotAvailable:
         from ...utils.dummy_torch_and_transformers_objects import *
     else:
+        from .pipeline_cosmos2_text2image import Cosmos2TextToImagePipeline
         from .pipeline_cosmos2_video2world import Cosmos2VideoToWorldPipeline
-        from .pipeline_cosmos_text2image import CosmosTextToImagePipeline
         from .pipeline_cosmos_text2world import CosmosTextToWorldPipeline
         from .pipeline_cosmos_video2world import CosmosVideoToWorldPipeline
 
diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos_text2image.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py
similarity index 98%
rename from src/diffusers/pipelines/cosmos/pipeline_cosmos_text2image.py
rename to src/diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py
index 0399d9ccf5..1d00c12ffa 100644
--- a/src/diffusers/pipelines/cosmos/pipeline_cosmos_text2image.py
+++ b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py
@@ -54,11 +54,11 @@ EXAMPLE_DOC_STRING = """
     Examples:
         ```python
         >>> import torch
-        >>> from diffusers import CosmosTextToImagePipeline
+        >>> from diffusers import Cosmos2TextToImagePipeline
 
         >>> # Available checkpoints: nvidia/Cosmos-Predict2-2B-Text2Image, nvidia/Cosmos-Predict2-14B-Text2Image
         >>> model_id = "nvidia/Cosmos-Predict2-2B-Text2Image"
-        >>> pipe = CosmosTextToImagePipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
+        >>> pipe = Cosmos2TextToImagePipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
         >>> pipe.to("cuda")
 
         >>> prompt = "A close-up shot captures a vibrant yellow scrubber vigorously working on a grimy plate, its bristles moving in circular motions to lift stubborn grease and food residue. The dish, once covered in remnants of a hearty meal, gradually reveals its original glossy surface. Suds form and bubble around the scrubber, creating a satisfying visual of cleanliness in progress. The sound of scrubbing fills the air, accompanied by the gentle clinking of the dish against the sink. As the scrubber continues its task, the dish transforms, gleaming under the bright kitchen lights, symbolizing the triumph of cleanliness over mess."
@@ -132,7 +132,7 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-class CosmosTextToImagePipeline(DiffusionPipeline):
+class Cosmos2TextToImagePipeline(DiffusionPipeline):
     r"""
     Pipeline for text-to-image generation using [Cosmos](https://github.com/NVIDIA/Cosmos).
 
@@ -637,6 +637,10 @@ class CosmosTextToImagePipeline(DiffusionPipeline):
             else:
                 video = self.video_processor.postprocess_video(video, output_type=output_type)
             image = [batch[0] for batch in video]
+            if isinstance(video, torch.Tensor):
+                image = torch.stack(image)
+            elif isinstance(video, np.ndarray):
+                image = np.stack(image)
         else:
             image = latents[:, :, 0]
 
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index 1ee5368b1f..026088b8e7 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -407,7 +407,7 @@ class ConsisIDPipeline(metaclass=DummyObject):
         requires_backends(cls, ["torch", "transformers"])
 
 
-class CosmosTextToImagePipeline(metaclass=DummyObject):
+class Cosmos2TextToImagePipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
     def __init__(self, *args, **kwargs):