From 33a8a3be0cede6e675e62f35ab5a2da3642bca48 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Thu, 18 Sep 2025 14:49:48 +0530 Subject: [PATCH] up --- src/diffusers/models/activations.py | 41 +++++++++------------------ src/diffusers/models/normalization.py | 30 ++++++++++++++++---- 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/src/diffusers/models/activations.py b/src/diffusers/models/activations.py index 87d181f75f..fe21b87961 100644 --- a/src/diffusers/models/activations.py +++ b/src/diffusers/models/activations.py @@ -17,8 +17,7 @@ import torch import torch.nn.functional as F from torch import nn -from ..utils import deprecate, get_logger, is_kernels_available, is_torch_npu_available, is_torch_version -from ..utils.constants import DIFFUSERS_ENABLE_HUB_KERNELS +from ..utils import deprecate, get_logger, is_torch_npu_available, is_torch_version logger = get_logger(__name__) @@ -93,36 +92,24 @@ class GELU(nn.Module): return hidden_states -class CUDAOptimizedGELU(nn.Module): - def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True): - if not torch.cuda.is_available(): - raise NotImplementedError(f"{self.__class__.__name__} is implemented only for CUDA devices.") - if not DIFFUSERS_ENABLE_HUB_KERNELS: - raise RuntimeError( - f"{self.__class__.__name__} isn't usable because the `DIFFUSERS_ENABLE_HUB_KERNELS` env var isn't set. Please set it like `export DIFFUSERS_ENABLE_HUB_KERNELS=yes`." - ) - if not is_kernels_available(): - raise NotImplementedError( - f"{self.__class__.__name__} requires the `kernels` library to be installed. Install it with `pip install kernels`." - ) - +# TODO: validation checks / consider making Python classes of activations like `transformers` +# All of these are temporary for now. 
+class CUDAOptimizedGELU(GELU): + def __init__(self, *args, **kwargs): from kernels import get_kernel - super().__init__() - self.proj = nn.Linear(dim_in, dim_out, bias=bias) - activations = get_kernel(KERNELS_REPO_ID) - if approximate == "tanh": - self.act = activations.gelu_tanh_and_mul - elif approximate == "none": - self.act = activations.gelu_and_mul - else: - raise NotImplementedError + activation = get_kernel("kernels-community/activation", revision="add_more_act") + approximate = kwargs.get("approximate", "none") + if approximate == "none": + self.act_fn = activation.gelu + elif approximate == "tanh": + self.act_fn = activation.gelu_tanh + super().__init__(*args, **kwargs) def forward(self, hidden_states): hidden_states = self.proj(hidden_states) - out = torch.empty_like(hidden_states) - output = self.act(out, hidden_states) - return output + hidden_states = self.act_fn(hidden_states) + return hidden_states class GEGLU(nn.Module): diff --git a/src/diffusers/models/normalization.py b/src/diffusers/models/normalization.py index 1d92dd5c60..8e9147993b 100644 --- a/src/diffusers/models/normalization.py +++ b/src/diffusers/models/normalization.py @@ -20,12 +20,20 @@ import torch import torch.nn as nn import torch.nn.functional as F -from ..utils import is_torch_npu_available, is_torch_version +from ..utils import is_kernels_available, is_torch_npu_available, is_torch_version +from ..utils.constants import DIFFUSERS_ENABLE_HUB_KERNELS from ..utils.kernels_utils import use_kernel_forward_from_hub from .activations import get_activation from .embeddings import CombinedTimestepLabelEmbeddings, PixArtAlphaCombinedTimestepSizeEmbeddings +if is_kernels_available() and DIFFUSERS_ENABLE_HUB_KERNELS: + from kernels import get_kernel + + activation = get_kernel("kernels-community/activation", revision="add_more_act") + silu_kernel = activation.silu + + class AdaLayerNorm(nn.Module): r""" Norm layer modified to incorporate timestep embeddings. @@ -58,7 +66,10 @@ class AdaLayerNorm(nn.Module): else: self.emb = None - self.silu = nn.SiLU() + if not DIFFUSERS_ENABLE_HUB_KERNELS: + self.silu = nn.SiLU() + else: + self.silu = silu_kernel self.linear = nn.Linear(embedding_dim, output_dim) self.norm = nn.LayerNorm(output_dim // 2, norm_eps, norm_elementwise_affine) @@ -145,7 +156,10 @@ class AdaLayerNormZero(nn.Module): else: self.emb = None - self.silu = nn.SiLU() + if not DIFFUSERS_ENABLE_HUB_KERNELS: + self.silu = nn.SiLU() + else: + self.silu = silu_kernel self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=bias) if norm_type == "layer_norm": self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6) @@ -184,7 +198,10 @@ class AdaLayerNormZeroSingle(nn.Module): def __init__(self, embedding_dim: int, norm_type="layer_norm", bias=True): super().__init__() - self.silu = nn.SiLU() + if not DIFFUSERS_ENABLE_HUB_KERNELS: + self.silu = nn.SiLU() + else: + self.silu = silu_kernel self.linear = nn.Linear(embedding_dim, 3 * embedding_dim, bias=bias) if norm_type == "layer_norm": self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6) @@ -336,7 +353,10 @@ class AdaLayerNormContinuous(nn.Module): norm_type="layer_norm", ): super().__init__() - self.silu = nn.SiLU() + if not DIFFUSERS_ENABLE_HUB_KERNELS: + self.silu = nn.SiLU() + else: + self.silu = silu_kernel self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias) if norm_type == "layer_norm": self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
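
Note (not part of the patch): a minimal usage sketch of the kernel-gated path introduced above, assuming the `kernels` package is installed, a CUDA device is available, and the `kernels-community/activation` repo exposes `gelu`/`gelu_tanh`/`silu` as referenced in the hunks; the tensor shapes and the `approximate="tanh"` choice below are illustrative only.

    import os

    # Assumption: DIFFUSERS_ENABLE_HUB_KERNELS is read from the environment at import
    # time (the normalization.py hunk uses it at module level), so set it before
    # importing diffusers. "yes" mirrors the export hint in the removed error message.
    os.environ["DIFFUSERS_ENABLE_HUB_KERNELS"] = "yes"

    import torch
    from diffusers.models.activations import CUDAOptimizedGELU

    # CUDAOptimizedGELU keeps GELU's (dim_in, dim_out, approximate, bias) signature but
    # routes the activation through the Hub kernel. Note the patched __init__ only picks
    # up `approximate` when it is passed as a keyword argument.
    act = CUDAOptimizedGELU(dim_in=64, dim_out=128, approximate="tanh").to("cuda")
    out = act(torch.randn(2, 16, 64, device="cuda"))  # -> shape (2, 16, 128)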