mirror of https://github.com/huggingface/diffusers.git synced 2026-01-27 17:22:53 +03:00
Author: sayakpaul
Date: 2026-01-15 08:50:35 +05:30
parent 7ad97d492d
commit 765eb50ff1
12 changed files with 88 additions and 97 deletions

View File

@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any
import torch
import torch.nn as nn
@@ -104,7 +104,7 @@ class GlmImageAdaLayerNormZero(nn.Module):
def forward(
self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, temb: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
) -> tuple[torch.Tensor, torch.Tensor]:
dtype = hidden_states.dtype
norm_hidden_states = self.norm(hidden_states).to(dtype=dtype)
norm_encoder_hidden_states = self.norm_context(encoder_hidden_states).to(dtype=dtype)
@@ -148,7 +148,7 @@ class GlmImageLayerKVCache:
def __init__(self):
self.k_cache = None
self.v_cache = None
self.mode: Optional[str] = None # "write", "read", "skip"
self.mode: str | None = None # "write", "read", "skip"
def store(self, k: torch.Tensor, v: torch.Tensor):
if self.k_cache is None:
@@ -186,7 +186,7 @@ class GlmImageKVCache:
def __getitem__(self, layer_idx: int) -> GlmImageLayerKVCache:
return self.caches[layer_idx]
def set_mode(self, mode: Optional[str]):
def set_mode(self, mode: str):
if mode is not None and mode not in ["write", "read", "skip"]:
raise ValueError(f"Invalid mode: {mode}, must be one of 'write', 'read', 'skip'")
for cache in self.caches:
@@ -218,10 +218,10 @@ class GlmImageAttnProcessor:
attn: Attention,
hidden_states: torch.Tensor,
encoder_hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
kv_cache: Optional[GlmImageLayerKVCache] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
attention_mask: torch.Tensor | None = None,
image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
kv_cache: GlmImageLayerKVCache | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
dtype = encoder_hidden_states.dtype
batch_size, text_seq_length, embed_dim = encoder_hidden_states.shape
@@ -330,14 +330,12 @@ class GlmImageTransformerBlock(nn.Module):
self,
hidden_states: torch.Tensor,
encoder_hidden_states: torch.Tensor,
temb: Optional[torch.Tensor] = None,
image_rotary_emb: Optional[
Union[Tuple[torch.Tensor, torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]]
] = None,
attention_mask: Optional[Dict[str, torch.Tensor]] = None,
attention_kwargs: Optional[Dict[str, Any]] = None,
kv_cache: Optional[GlmImageLayerKVCache] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
temb: torch.Tensor | None = None,
image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | list[tuple[torch.Tensor, torch.Tensor]] | None = None,
attention_mask: dict[str, torch.Tensor] | None = None,
attention_kwargs: dict[str, Any] | None = None,
kv_cache: GlmImageLayerKVCache | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
# 1. Timestep conditioning
(
norm_hidden_states,
@@ -388,7 +386,7 @@ class GlmImageRotaryPosEmbed(nn.Module):
self.patch_size = patch_size
self.theta = theta
def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
batch_size, num_channels, height, width = hidden_states.shape
height, width = height // self.patch_size, width // self.patch_size
@@ -553,14 +551,12 @@ class GlmImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Cach
timestep: torch.LongTensor,
target_size: torch.Tensor,
crop_coords: torch.Tensor,
attention_kwargs: Optional[Dict[str, Any]] = None,
attention_kwargs: dict[str, Any] | None = None,
return_dict: bool = True,
attention_mask: Optional[torch.Tensor] = None,
kv_caches: Optional[GlmImageKVCache] = None,
image_rotary_emb: Optional[
Union[Tuple[torch.Tensor, torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]]
] = None,
) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
attention_mask: torch.Tensor | None = None,
kv_caches: GlmImageKVCache | None = None,
image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | list[tuple[torch.Tensor, torch.Tensor]] | None = None,
) -> tuple[torch.Tensor] | Transformer2DModelOutput:
batch_size, num_channels, height, width = hidden_states.shape
# 1. RoPE

View File

@@ -15,7 +15,7 @@
import inspect
import re
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable
import numpy as np
import PIL
@@ -79,10 +79,10 @@ def calculate_shift(
# Copied from diffusers.pipelines.cogview4.pipeline_cogview4.retrieve_timesteps
def retrieve_timesteps(
scheduler,
num_inference_steps: Optional[int] = None,
device: Optional[Union[str, torch.device]] = None,
timesteps: Optional[List[int]] = None,
sigmas: Optional[List[float]] = None,
num_inference_steps: int | None = None,
device: str | torch.device | None = None,
timesteps: list[int] | None = None,
sigmas: list[float] | None = None,
**kwargs,
):
r"""
@@ -97,10 +97,10 @@ def retrieve_timesteps(
must be `None`.
device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*):
timesteps (`list[int]`, *optional*):
Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
`num_inference_steps` and `sigmas` must be `None`.
sigmas (`List[float]`, *optional*):
sigmas (`list[float]`, *optional*):
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
`num_inference_steps` and `timesteps` must be `None`.
@@ -146,7 +146,7 @@ def retrieve_timesteps(
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample"
):
if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
return encoder_output.latent_dist.sample(generator)
@@ -265,8 +265,8 @@ class GlmImagePipeline(DiffusionPipeline):
prompt: str,
height: int,
width: int,
image: Optional[List[PIL.Image.Image]] = None,
device: Optional[torch.device] = None,
image: list[PIL.Image.Image] | None = None,
device: torch.device | None = None,
):
device = device or self._execution_device
is_text_to_image = image is None or len(image) == 0
@@ -327,10 +327,10 @@ class GlmImagePipeline(DiffusionPipeline):
def _get_glyph_embeds(
self,
prompt: Union[str, List[str]] = None,
prompt: str | list[str] = None,
max_sequence_length: int = 2048,
device: Optional[torch.device] = None,
dtype: Optional[torch.dtype] = None,
device: torch.device | None = None,
dtype: torch.dtype | None = None,
):
device = device or self._execution_device
dtype = dtype or self.text_encoder.dtype
@@ -359,20 +359,20 @@ class GlmImagePipeline(DiffusionPipeline):
def encode_prompt(
self,
prompt: Union[str, List[str]],
prompt: str | list[str],
do_classifier_free_guidance: bool = True,
num_images_per_prompt: int = 1,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
device: Optional[torch.device] = None,
dtype: Optional[torch.dtype] = None,
prompt_embeds: torch.Tensor | None = None,
negative_prompt_embeds: torch.Tensor | None = None,
device: torch.device | None = None,
dtype: torch.dtype | None = None,
max_sequence_length: int = 2048,
):
r"""
Encodes the prompt into text encoder hidden states.
Args:
prompt (`str` or `List[str]`, *optional*):
prompt (`str` or `list[str]`, *optional*):
prompt to be encoded
do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
Whether to use classifier free guidance or not.
@@ -527,40 +527,43 @@ class GlmImagePipeline(DiffusionPipeline):
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
prompt: Optional[Union[str, List[str]]] = None,
image: Optional[
Union[
torch.Tensor, PIL.Image.Image, np.ndarray, List[torch.Tensor], List[PIL.Image.Image], List[np.ndarray]
]
] = None,
height: Optional[int] = None,
width: Optional[int] = None,
prompt: str | list[str] | None = None,
image: torch.Tensor
| PIL.Image.Image
| np.ndarray
| list[torch.Tensor]
| list[PIL.Image.Image]
| list[np.ndarray]
| None = None,
height: int | None = None,
width: int | None = None,
num_inference_steps: int = 50,
timesteps: Optional[List[int]] = None,
sigmas: Optional[List[float]] = None,
timesteps: list[int] | None = None,
sigmas: list[float] | None = None,
guidance_scale: float = 1.5,
num_images_per_prompt: int = 1,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
prior_token_ids: Optional[torch.FloatTensor] = None,
prior_image_token_ids: Optional[torch.Tensor] = None,
crops_coords_top_left: Tuple[int, int] = (0, 0),
generator: torch.Generator | list[torch.Generator] | None = None,
latents: torch.FloatTensor | None = None,
prompt_embeds: torch.Tensor | None = None,
negative_prompt_embeds: torch.Tensor | None = None,
prior_token_ids: torch.FloatTensor | None = None,
prior_image_token_ids: torch.Tensor | None = None,
crops_coords_top_left: tuple[int, int] = (0, 0),
output_type: str = "pil",
return_dict: bool = True,
attention_kwargs: Optional[Dict[str, Any]] = None,
callback_on_step_end: Optional[
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
attention_kwargs: dict[str, Any] | None = None,
callback_on_step_end: Callable[[int, int, dict], None]
| PipelineCallback
| MultiPipelineCallbacks
| None = None,
callback_on_step_end_tensor_inputs: list[str] = ["latents"],
max_sequence_length: int = 2048,
) -> Union[GlmImagePipelineOutput, Tuple]:
) -> GlmImagePipelineOutput | tuple:
"""
Function invoked when calling the pipeline for generation.
Args:
prompt (`str` or `List[str]`, *optional*):
prompt (`str` or `list[str]`, *optional*):
The prompt or prompts to guide the image generation. Must contain shape info in the format '<sop>H
W<eop>' where H and W are token dimensions (d32). Example: "A beautiful sunset<sop>36 24<eop>"
generates a 1152x768 image.
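As a rough usage sketch of the `__call__` signature above (not from the diff; the checkpoint id is hypothetical and the import path is assumed), the prompt carries the target token grid, with 36 x 24 tokens mapping to a 1152x768 image per the docstring example:

```python
import torch
from diffusers import GlmImagePipeline  # import path assumed

# hypothetical checkpoint id
pipe = GlmImagePipeline.from_pretrained("some-org/glm-image", torch_dtype=torch.bfloat16)
pipe.enable_model_cpu_offload()

image = pipe(
    prompt="A beautiful sunset<sop>36 24<eop>",  # 36 x 24 tokens -> 1152 x 768 pixels
    num_inference_steps=50,
    guidance_scale=1.5,
).images[0]
image.save("sunset.png")
```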

View File

@@ -1,5 +1,4 @@
from dataclasses import dataclass
from typing import List, Union
import numpy as np
import PIL.Image
@@ -18,4 +17,4 @@ class GlmImagePipelineOutput(BaseOutput):
num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
"""
images: Union[List[PIL.Image.Image], np.ndarray]
images: list[PIL.Image.Image] | np.ndarray

View File

@@ -128,7 +128,7 @@ class ImagePipelineOutput(BaseOutput):
num_channels)`.
"""
images: Union[List[PIL.Image.Image], np.ndarray]
images: list[PIL.Image.Image] | np.ndarray
@dataclass
@@ -1171,7 +1171,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
accelerate.hooks.remove_hook_from_module(model, recurse=True)
self._all_hooks = []
def enable_model_cpu_offload(self, gpu_id: int | None = None, device: Union[torch.device, str] = None):
def enable_model_cpu_offload(self, gpu_id: int | None = None, device: torch.device | str = None):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the accelerator when its
@@ -1289,7 +1289,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
# make sure the model is in the same state as before calling it
self.enable_model_cpu_offload(device=getattr(self, "_offload_device", "cuda"))
def enable_sequential_cpu_offload(self, gpu_id: int | None = None, device: Union[torch.device, str] = None):
def enable_sequential_cpu_offload(self, gpu_id: int | None = None, device: torch.device | str = None):
r"""
Offloads all models to CPU using 🤗 Accelerate, significantly reducing memory usage. When called, the state
dicts of all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are saved to CPU
@@ -1498,7 +1498,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
@classmethod
@validate_hf_hub_args
def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]:
def download(cls, pretrained_model_name, **kwargs) -> str | os.PathLike:
r"""
Download and cache a PyTorch diffusion pipeline from pretrained pipeline weights.
@@ -1880,7 +1880,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
return signature_types
@property
def parameters(self) -> Dict[str, Any]:
def parameters(self) -> dict[str, Any]:
r"""
The `self.parameters` property can be useful to run different pipelines with the same weights and
configurations without reallocating additional memory.
@@ -1910,7 +1910,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
return pipeline_parameters
@property
def components(self) -> Dict[str, Any]:
def components(self) -> dict[str, Any]:
r"""
The `self.components` property can be useful to run different pipelines with the same weights and
configurations without reallocating additional memory.
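A brief usage sketch of the offloading helpers and the `components` property touched in this file (the checkpoint id is hypothetical; any diffusers checkpoint would do):

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("some-org/some-checkpoint", torch_dtype=torch.float16)  # hypothetical id

pipe.enable_model_cpu_offload(device="cuda")         # moves one whole model at a time to the accelerator
# pipe.enable_sequential_cpu_offload(device="cuda")  # lower peak memory, but slower

# `components` exposes the loaded modules so another pipeline can reuse the same
# weights without reallocating memory:
print(list(pipe.components.keys()))
```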

View File

@@ -15,8 +15,6 @@
# limitations under the License.
from typing import Tuple, Union
import torch
from diffusers import DiffusionPipeline, ImagePipelineOutput, SchedulerMixin, UNet2DModel
@@ -47,7 +45,7 @@ class CustomLocalPipeline(DiffusionPipeline):
output_type: str | None = "pil",
return_dict: bool = True,
**kwargs,
) -> Union[ImagePipelineOutput, Tuple]:
) -> ImagePipelineOutput | tuple:
r"""
Args:
batch_size (`int`, *optional*, defaults to 1):

View File

@@ -15,8 +15,6 @@
# limitations under the License.
from typing import Tuple, Union
import torch
from diffusers import SchedulerMixin, UNet2DModel
@@ -48,7 +46,7 @@ class CustomLocalPipeline(DiffusionPipeline):
output_type: str | None = "pil",
return_dict: bool = True,
**kwargs,
) -> Union[ImagePipelineOutput, Tuple]:
) -> ImagePipelineOutput | tuple:
r"""
Args:
batch_size (`int`, *optional*, defaults to 1):

View File

@@ -26,7 +26,7 @@ import unittest
import unittest.mock as mock
import uuid
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union
from typing import Dict, List, Tuple
import numpy as np
import pytest
@@ -168,8 +168,8 @@ def named_persistent_module_tensors(
def compute_module_persistent_sizes(
model: nn.Module,
dtype: Optional[Union[str, torch.device]] = None,
special_dtypes: Optional[Dict[str, Union[str, torch.device]]] = None,
dtype: str | torch.device | None = None,
special_dtypes: dict[str, str | torch.device] | None = None,
):
"""
Compute the size of each submodule of a given model (parameters + persistent buffers).

View File

@@ -1,6 +1,6 @@
import gc
import tempfile
from typing import Callable, Union
from typing import Callable
import pytest
import torch
@@ -36,7 +36,7 @@ class ModularPipelineTesterMixin:
return generator
@property
def pipeline_class(self) -> Union[Callable, ModularPipeline]:
def pipeline_class(self) -> Callable | ModularPipeline:
raise NotImplementedError(
"You need to set the attribute `pipeline_class = ClassNameOfPipeline` in the child test class. "
"See existing pipeline tests for reference."
@@ -49,7 +49,7 @@ class ModularPipelineTesterMixin:
)
@property
def pipeline_blocks_class(self) -> Union[Callable, ModularPipelineBlocks]:
def pipeline_blocks_class(self) -> Callable | ModularPipelineBlocks:
raise NotImplementedError(
"You need to set the attribute `pipeline_blocks_class = ClassNameOfPipelineBlocks` in the child test class. "
"See existing pipeline tests for reference."
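For context, a minimal child test class as the error messages above describe it (all names here are hypothetical):

```python
class MyModularPipelineFastTests(ModularPipelineTesterMixin):
    # Overriding the properties above with plain class attributes wires the mixin
    # to the classes under test (hypothetical names).
    pipeline_class = MyModularPipeline
    pipeline_blocks_class = MyModularPipelineBlocks
```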

View File

@@ -1,7 +1,6 @@
import pickle as pkl
import unittest
from dataclasses import dataclass
from typing import List, Union
import numpy as np
import PIL.Image
@@ -13,7 +12,7 @@ from ..testing_utils import require_torch
@dataclass
class CustomOutput(BaseOutput):
images: Union[List[PIL.Image.Image], np.ndarray]
images: list[PIL.Image.Image] | np.ndarray
class ConfigTester(unittest.TestCase):

View File

@@ -14,7 +14,6 @@
# ===== This file is an implementation of a dummy guardrail for the fast tests =====
from typing import Union
import numpy as np
import torch
@@ -35,7 +34,7 @@ class DummyCosmosSafetyChecker(ModelMixin, ConfigMixin):
def check_video_safety(self, frames: np.ndarray) -> np.ndarray:
return frames
def to(self, device: Union[str, torch.device] = None, dtype: torch.dtype = None):
def to(self, device: str | torch.device = None, dtype: torch.dtype = None):
module = super().to(device=device, dtype=dtype)
return module

View File

@@ -5,7 +5,7 @@ import os
import tempfile
import unittest
import uuid
from typing import Any, Callable, Dict, Union
from typing import Any, Callable, Dict
import numpy as np
import PIL.Image
@@ -1071,7 +1071,7 @@ class PipelineTesterMixin:
return generator
@property
def pipeline_class(self) -> Union[Callable, DiffusionPipeline]:
def pipeline_class(self) -> Callable | DiffusionPipeline:
raise NotImplementedError(
"You need to set the attribute `pipeline_class = ClassNameOfPipeline` in the child test class. "
"See existing pipeline tests for reference."

View File

@@ -14,7 +14,6 @@
# limitations under the License.
import unittest
from typing import Tuple, Union
import numpy as np
import PIL.Image
@@ -44,13 +43,13 @@ enable_full_determinism()
class RemoteAutoencoderKLMixin:
shape: Tuple[int, ...] = None
out_hw: Tuple[int, int] = None
shape: tuple[int, ...] = None
out_hw: tuple[int, int] = None
endpoint: str = None
dtype: torch.dtype = None
scaling_factor: float = None
shift_factor: float = None
processor_cls: Union[VaeImageProcessor, VideoProcessor] = None
processor_cls: VaeImageProcessor | VideoProcessor = None
output_pil_slice: torch.Tensor = None
output_pt_slice: torch.Tensor = None
partial_postprocess_return_pt_slice: torch.Tensor = None