mirror of https://github.com/huggingface/diffusers.git synced 2026-01-27 17:22:53 +03:00
Author: sayakpaul
Date: 2026-01-15 08:50:35 +05:30
parent 7ad97d492d
commit 765eb50ff1
12 changed files with 88 additions and 97 deletions

View File

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any
import torch
import torch.nn as nn
@@ -104,7 +104,7 @@ class GlmImageAdaLayerNormZero(nn.Module):
def forward(
self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, temb: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor]:
dtype = hidden_states.dtype
norm_hidden_states = self.norm(hidden_states).to(dtype=dtype)
norm_encoder_hidden_states = self.norm_context(encoder_hidden_states).to(dtype=dtype)
@@ -148,7 +148,7 @@ class GlmImageLayerKVCache:
def __init__(self):
self.k_cache = None
self.v_cache = None
self.mode: Optional[str] = None # "write", "read", "skip"
self.mode: str | None = None # "write", "read", "skip"
def store(self, k: torch.Tensor, v: torch.Tensor):
if self.k_cache is None:
@@ -186,7 +186,7 @@ class GlmImageKVCache:
def __getitem__(self, layer_idx: int) -> GlmImageLayerKVCache:
return self.caches[layer_idx]
def set_mode(self, mode: Optional[str]):
def set_mode(self, mode: str):
if mode is not None and mode not in ["write", "read", "skip"]:
raise ValueError(f"Invalid mode: {mode}, must be one of 'write', 'read', 'skip'")
for cache in self.caches:
@@ -218,10 +218,10 @@ class GlmImageAttnProcessor:
attn: Attention,
hidden_states: torch.Tensor,
encoder_hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
kv_cache: Optional[GlmImageLayerKVCache] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
attention_mask: torch.Tensor | None = None,
image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
kv_cache: GlmImageLayerKVCache | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
dtype = encoder_hidden_states.dtype
batch_size, text_seq_length, embed_dim = encoder_hidden_states.shape
@@ -330,14 +330,12 @@ class GlmImageTransformerBlock(nn.Module):
self,
hidden_states: torch.Tensor,
encoder_hidden_states: torch.Tensor,
temb: Optional[torch.Tensor] = None,
image_rotary_emb: Optional[
Union[Tuple[torch.Tensor, torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]]
] = None,
attention_mask: Optional[Dict[str, torch.Tensor]] = None,
attention_kwargs: Optional[Dict[str, Any]] = None,
kv_cache: Optional[GlmImageLayerKVCache] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
temb: torch.Tensor | None = None,
image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | list[tuple[torch.Tensor, torch.Tensor]] | None = None,
attention_mask: dict[str, torch.Tensor] | None = None,
attention_kwargs: dict[str, Any] | None = None,
kv_cache: GlmImageLayerKVCache | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
# 1. Timestep conditioning
(
norm_hidden_states,
@@ -388,7 +386,7 @@ class GlmImageRotaryPosEmbed(nn.Module):
self.patch_size = patch_size
self.theta = theta
def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
batch_size, num_channels, height, width = hidden_states.shape
height, width = height // self.patch_size, width // self.patch_size
@@ -553,14 +551,12 @@ class GlmImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Cach
timestep: torch.LongTensor,
target_size: torch.Tensor,
crop_coords: torch.Tensor,
attention_kwargs: Optional[Dict[str, Any]] = None,
attention_kwargs: dict[str, Any] | None = None,
return_dict: bool = True,
attention_mask: Optional[torch.Tensor] = None,
kv_caches: Optional[GlmImageKVCache] = None,
image_rotary_emb: Optional[
Union[Tuple[torch.Tensor, torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]]
] = None,
) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
attention_mask: torch.Tensor | None = None,
kv_caches: GlmImageKVCache | None = None,
image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | list[tuple[torch.Tensor, torch.Tensor]] | None = None,
) -> tuple[torch.Tensor] | Transformer2DModelOutput:
batch_size, num_channels, height, width = hidden_states.shape
# 1. RoPE

View File

@@ -15,7 +15,7 @@
import inspect
import re
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable
import numpy as np
import PIL
@@ -79,10 +79,10 @@ def calculate_shift(
# Copied from diffusers.pipelines.cogview4.pipeline_cogview4.retrieve_timesteps
def retrieve_timesteps(
scheduler,
num_inference_steps: Optional[int] = None,
device: Optional[Union[str, torch.device]] = None,
timesteps: Optional[List[int]] = None,
sigmas: Optional[List[float]] = None,
num_inference_steps: int | None = None,
device: str | torch.device | None = None,
timesteps: list[int] | None = None,
sigmas: list[float] | None = None,
**kwargs,
):
r"""
@@ -97,10 +97,10 @@ def retrieve_timesteps(
must be `None`.
device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*):
timesteps (`list[int]`, *optional*):
Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
`num_inference_steps` and `sigmas` must be `None`.
sigmas (`List[float]`, *optional*):
sigmas (`list[float]`, *optional*):
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
`num_inference_steps` and `timesteps` must be `None`.
@@ -146,7 +146,7 @@ def retrieve_timesteps(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample"
):
if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
return encoder_output.latent_dist.sample(generator)
@@ -265,8 +265,8 @@ class GlmImagePipeline(DiffusionPipeline):
prompt: str,
height: int,
width: int,
image: Optional[List[PIL.Image.Image]] = None,
device: Optional[torch.device] = None,
image: list[PIL.Image.Image] | None = None,
device: torch.device | None = None,
):
device = device or self._execution_device
is_text_to_image = image is None or len(image) == 0
@@ -327,10 +327,10 @@ class GlmImagePipeline(DiffusionPipeline):
def _get_glyph_embeds(
self,
prompt: Union[str, List[str]] = None,
prompt: str | list[str] = None,
max_sequence_length: int = 2048,
device: Optional[torch.device] = None,
dtype: Optional[torch.dtype] = None,
device: torch.device | None = None,
dtype: torch.dtype | None = None,
):
device = device or self._execution_device
dtype = dtype or self.text_encoder.dtype
@@ -359,20 +359,20 @@ class GlmImagePipeline(DiffusionPipeline):
def encode_prompt(
self,
prompt: Union[str, List[str]],
prompt: str | list[str],
do_classifier_free_guidance: bool = True,
num_images_per_prompt: int = 1,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
device: Optional[torch.device] = None,
dtype: Optional[torch.dtype] = None,
prompt_embeds: torch.Tensor | None = None,
negative_prompt_embeds: torch.Tensor | None = None,
device: torch.device | None = None,
dtype: torch.dtype | None = None,
max_sequence_length: int = 2048,
):
r"""
Encodes the prompt into text encoder hidden states.
Args:
prompt (`str` or `List[str]`, *optional*):
prompt (`str` or `list[str]`, *optional*):
prompt to be encoded
do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
Whether to use classifier free guidance or not.
@@ -527,40 +527,43 @@ class GlmImagePipeline(DiffusionPipeline):
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
prompt: Optional[Union[str, List[str]]] = None,
image: Optional[
Union[
torch.Tensor, PIL.Image.Image, np.ndarray, List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray]
]
] = None,
height: Optional[int] = None,
width: Optional[int] = None,
prompt: str | list[str] | None = None,
image: torch.Tensor
| PIL.Image.Image
| np.ndarray
| list[torch.Tensor]
| list[PIL.Image.Image]
| list[np.ndarray]
| None = None,
height: int | None = None,
width: int | None = None,
num_inference_steps: int = 50,
timesteps: Optional[List[int]] = None,
sigmas: Optional[List[float]] = None,
timesteps: list[int] | None = None,
sigmas: list[float] | None = None,
guidance_scale: float = 1.5,
num_images_per_prompt: int = 1,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
prior_token_ids: Optional[torch.FloatTensor] = None,
prior_image_token_ids: Optional[torch.Tensor] = None,
crops_coords_top_left: Tuple[int, int] = (0, 0),
generator: torch.Generator | list[torch.Generator] | None = None,
latents: torch.FloatTensor | None = None,
prompt_embeds: torch.Tensor | None = None,
negative_prompt_embeds: torch.Tensor | None = None,
prior_token_ids: torch.FloatTensor | None = None,
prior_image_token_ids: torch.Tensor | None = None,
crops_coords_top_left: tuple[int, int] = (0, 0),
output_type: str = "pil",
return_dict: bool = True,
attention_kwargs: Optional[Dict[str, Any]] = None,
callback_on_step_end: Optional[
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
attention_kwargs: dict[str, Any] | None = None,
callback_on_step_end: Callable[[int, int, dict], None]
| PipelineCallback
| MultiPipelineCallbacks
| None = None,
callback_on_step_end_tensor_inputs: list[str] = ["latents"],
max_sequence_length: int = 2048,
) -> Union[GlmImagePipelineOutput, Tuple]:
) -> GlmImagePipelineOutput | tuple:
"""
Function invoked when calling the pipeline for generation.
Args:
prompt (`str` or `List[str]`, *optional*):
prompt (`str` or `list[str]`, *optional*):
The prompt or prompts to guide the image generation. Must contain shape info in the format '<sop>H
W<eop>' where H and W are token dimensions (d32). Example: "A beautiful sunset<sop>36 24<eop>"
generates a 1152x768 image.
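As a rough usage sketch of the `__call__` signature above (not from the diff; the checkpoint id is hypothetical and the import path is assumed), the prompt carries the target token grid, with 36 x 24 tokens mapping to a 1152x768 image per the docstring example:

```python
import torch
from diffusers import GlmImagePipeline  # import path assumed

# hypothetical checkpoint id
pipe = GlmImagePipeline.from_pretrained("some-org/glm-image", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()

image = pipe(
    prompt="A beautiful sunset<sop>36 24<eop>",  # 36 x 24 tokens -> 1152 x 768 pixels
    num_inference_steps=50,
    guidance_scale=1.5,
).images[0]
image.save("sunset.png")
```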

View File

@@ -1,5 +1,4 @@
from dataclasses import dataclass
from typing import List, Union
import numpy as np
import PIL.Image
@@ -18,4 +17,4 @@ class GlmImagePipelineOutput(BaseOutput):
num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
"""
images: Union[List[PIL.Image.Image], np.ndarray]
images: list[PIL.Image.Image] | np.ndarray

View File

@@ -128,7 +128,7 @@ class ImagePipelineOutput(BaseOutput):
num_channels)`.
"""
images: Union[List[PIL.Image.Image], np.ndarray]
images: list[PIL.Image.Image] | np.ndarray
@dataclass
@@ -1171,7 +1171,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
accelerate.hooks.remove_hook_from_module(model, recurse=True)
self._all_hooks = []
def enable_model_cpu_offload(self, gpu_id: int | None = None, device: Union[torch.device, str] = None):
def enable_model_cpu_offload(self, gpu_id: int | None = None, device: torch.device | str = None):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the accelerator when its
@@ -1289,7 +1289,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
# make sure the model is in the same state as before calling it
self.enable_model_cpu_offload(device=getattr(self, "_offload_device", "cuda"))
def enable_sequential_cpu_offload(self, gpu_id: int | None = None, device: Union[torch.device, str] = None):
def enable_sequential_cpu_offload(self, gpu_id: int | None = None, device: torch.device | str = None):
r"""
Offloads all models to CPU using 🤗 Accelerate, significantly reducing memory usage. When called, the state
dicts of all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are saved to CPU
@@ -1498,7 +1498,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
@classmethod
@validate_hf_hub_args
def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
def download(cls, pretrained_model_name, **kwargs) -> str | os.PathLike:
r"""
Download and cache a PyTorch diffusion pipeline from pretrained pipeline weights.
@@ -1880,7 +1880,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
return signature_types
@property
def parameters(self) -> Dict[str, Any]:
def parameters(self) -> dict[str, Any]:
r"""
The `self.parameters` property can be useful to run different pipelines with the same weights and
configurations without reallocating additional memory.
@@ -1910,7 +1910,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
return pipeline_parameters
@property
def components(self) -> Dict[str, Any]:
def components(self) -> dict[str, Any]:
r"""
The `self.components` property can be useful to run different pipelines with the same weights and
configurations without reallocating additional memory.
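A brief usage sketch of the offloading helpers and the `components` property touched in this file (the checkpoint id is hypothetical; any diffusers checkpoint would do):

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("some-org/some-checkpoint", torch_dtype=torch.float16)  # hypothetical id

pipe.enable_model_cpu_offload(device="cuda")         # moves one whole model at a time to the accelerator
# pipe.enable_sequential_cpu_offload(device="cuda")  # lower peak memory, but slower

# `components` exposes the loaded modules so another pipeline can reuse the same
# weights without reallocating memory:
print(list(pipe.components.keys()))
```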

View File

@@ -15,8 +15,6 @@
# limitations under the License.
from typing import Tuple, Union
import torch
from diffusers import DiffusionPipeline, ImagePipelineOutput, SchedulerMixin, UNet2DModel
@@ -47,7 +45,7 @@ class CustomLocalPipeline(DiffusionPipeline):
output_type: str | None = "pil",
return_dict: bool = True,
**kwargs,
) -> Union[ImagePipelineOutput, Tuple]:
) -> ImagePipelineOutput | tuple:
r"""
Args:
batch_size (`int`, *optional*, defaults to 1):

View File

@@ -15,8 +15,6 @@
# limitations under the License.
from typing import Tuple, Union
import torch
from diffusers import SchedulerMixin, UNet2DModel
@@ -48,7 +46,7 @@ class CustomLocalPipeline(DiffusionPipeline):
output_type: str | None = "pil",
return_dict: bool = True,
**kwargs,
) -> Union[ImagePipelineOutput, Tuple]:
) -> ImagePipelineOutput | tuple:
r"""
Args:
batch_size (`int`, *optional*, defaults to 1):

View File

@@ -26,7 +26,7 @@ import unittest
import unittest.mock as mock
import uuid
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union
from typing import Dict, List, Tuple
import numpy as np
import pytest
@@ -168,8 +168,8 @@ def named_persistent_module_tensors(
def compute_module_persistent_sizes(
model: nn.Module,
dtype: Optional[Union[str, torch.device]] = None,
special_dtypes: Optional[Dict[str, Union[str, torch.device]]] = None,
dtype: str | torch.device | None = None,
special_dtypes: dict[str, str | torch.device] | None = None,
):
"""
Compute the size of each submodule of a given model (parameters + persistent buffers).

View File

@@ -1,6 +1,6 @@
import gc
import tempfile
from typing import Callable, Union
from typing import Callable
import pytest
import torch
@@ -36,7 +36,7 @@ class ModularPipelineTesterMixin:
return generator
@property
def pipeline_class(self) -> Union[Callable, ModularPipeline]:
def pipeline_class(self) -> Callable | ModularPipeline:
raise NotImplementedError(
"You need to set the attribute `pipeline_class = ClassNameOfPipeline` in the child test class. "
"See existing pipeline tests for reference."
@@ -49,7 +49,7 @@ class ModularPipelineTesterMixin:
)
@property
def pipeline_blocks_class(self) -> Union[Callable, ModularPipelineBlocks]:
def pipeline_blocks_class(self) -> Callable | ModularPipelineBlocks:
raise NotImplementedError(
"You need to set the attribute `pipeline_blocks_class = ClassNameOfPipelineBlocks` in the child test class. "
"See existing pipeline tests for reference."
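For context, a minimal child test class as the error messages above describe it (all names here are hypothetical):

```python
class MyModularPipelineFastTests(ModularPipelineTesterMixin):
    # Overriding the properties above with plain class attributes wires the mixin
    # to the classes under test (hypothetical names).
    pipeline_class = MyModularPipeline
    pipeline_blocks_class = MyModularPipelineBlocks
```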

View File

@@ -1,7 +1,6 @@
import pickle as pkl
import unittest
from dataclasses import dataclass
from typing import List, Union
import numpy as np
import PIL.Image
@@ -13,7 +12,7 @@ from ..testing_utils import require_torch
@dataclass
class CustomOutput(BaseOutput):
images: Union[List[PIL.Image.Image], np.ndarray]
images: list[PIL.Image.Image] | np.ndarray
class ConfigTester(unittest.TestCase):

View File

@@ -14,7 +14,6 @@
# ===== This file is an implementation of a dummy guardrail for the fast tests =====
from typing import Union
import numpy as np
import torch
@@ -35,7 +34,7 @@ class DummyCosmosSafetyChecker(ModelMixin, ConfigMixin):
def check_video_safety(self, frames: np.ndarray) -> np.ndarray:
return frames
def to(self, device: Union[str, torch.device] = None, dtype: torch.dtype = None):
def to(self, device: str | torch.device = None, dtype: torch.dtype = None):
module = super().to(device=device, dtype=dtype)
return module

View File

@@ -5,7 +5,7 @@ import os
import tempfile
import unittest
import uuid
from typing import Any, Callable, Dict, Union
from typing import Any, Callable, Dict
import numpy as np
import PIL.Image
@@ -1071,7 +1071,7 @@ class PipelineTesterMixin:
return generator
@property
def pipeline_class(self) -> Union[Callable, DiffusionPipeline]:
def pipeline_class(self) -> Callable | DiffusionPipeline:
raise NotImplementedError(
"You need to set the attribute `pipeline_class = ClassNameOfPipeline` in the child test class. "
"See existing pipeline tests for reference."

View File

@@ -14,7 +14,6 @@
# limitations under the License.
import unittest
from typing import Tuple, Union
import numpy as np
import PIL.Image
@@ -44,13 +43,13 @@ enable_full_determinism()
class RemoteAutoencoderKLMixin:
shape: Tuple[int, ...] = None
out_hw: Tuple[int, int] = None
shape: tuple[int, ...] = None
out_hw: tuple[int, int] = None
endpoint: str = None
dtype: torch.dtype = None
scaling_factor: float = None
shift_factor: float = None
processor_cls: Union[VaeImageProcessor, VideoProcessor] = None
processor_cls: VaeImageProcessor | VideoProcessor = None
output_pil_slice: torch.Tensor = None
output_pt_slice: torch.Tensor = None
partial_postprocess_return_pt_slice: torch.Tensor = None