From 478df933c30a107c9bec31afa6706b09c75cfe69 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 18 Jul 2025 08:28:51 +0100 Subject: [PATCH] [docs] clarify the mapping between `Transformer2DModel` and finegrained variants. (#11947) * clarify the mapping between Transformer2DModel and finegrained variants. * Update src/diffusers/pipelines/dit/pipeline_dit.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * fix --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- src/diffusers/pipelines/dit/pipeline_dit.py | 4 +++- .../pixart_alpha/pipeline_pixart_alpha.py | 4 +++- .../pixart_alpha/pipeline_pixart_sigma.py | 20 +++++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/dit/pipeline_dit.py b/src/diffusers/pipelines/dit/pipeline_dit.py index 14f63ea229..68ff6c9b55 100644 --- a/src/diffusers/pipelines/dit/pipeline_dit.py +++ b/src/diffusers/pipelines/dit/pipeline_dit.py @@ -46,7 +46,9 @@ class DiTPipeline(DiffusionPipeline): Parameters: transformer ([`DiTTransformer2DModel`]): - A class conditioned `DiTTransformer2DModel` to denoise the encoded image latents. + A class conditioned `DiTTransformer2DModel` to denoise the encoded image latents. Initially published as + [`Transformer2DModel`](https://huggingface.co/facebook/DiT-XL-2-256/blob/main/transformer/config.json#L2) + in the config, but the mismatch can be ignored. vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. 
scheduler ([`DDIMScheduler`]): diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py index f7e70c511b..bd69746be3 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py @@ -256,7 +256,9 @@ class PixArtAlphaPipeline(DiffusionPipeline): Tokenizer of class [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). transformer ([`PixArtTransformer2DModel`]): - A text conditioned `PixArtTransformer2DModel` to denoise the encoded image latents. + A text conditioned `PixArtTransformer2DModel` to denoise the encoded image latents. Initially published as + [`Transformer2DModel`](https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS/blob/main/transformer/config.json#L2) + in the config, but the mismatch can be ignored. scheduler ([`SchedulerMixin`]): A scheduler to be used in combination with `transformer` to denoise the encoded image latents. """ diff --git a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py index c3d235d91b..c14036cf94 100644 --- a/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py +++ b/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_sigma.py @@ -185,6 +185,26 @@ def retrieve_timesteps( class PixArtSigmaPipeline(DiffusionPipeline): r""" Pipeline for text-to-image generation using PixArt-Sigma. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`T5EncoderModel`]): + Frozen text-encoder. 
PixArt-Sigma uses + [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the + [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant. + tokenizer (`T5Tokenizer`): + Tokenizer of class + [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). + transformer ([`PixArtTransformer2DModel`]): + A text conditioned `PixArtTransformer2DModel` to denoise the encoded image latents. Initially published as + [`Transformer2DModel`](https://huggingface.co/PixArt-alpha/PixArt-Sigma-XL-2-1024-MS/blob/main/transformer/config.json#L2) + in the config, but the mismatch can be ignored. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. """ bad_punct_regex = re.compile(